src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. CCL handlers
  29   6. End-of-line handlers
  30   7. C library functions
  31   8. Emacs Lisp library functions
  32   9. Post-amble
  33
  34 */
  35
  36 /*** GENERAL NOTE on CODING SYSTEM ***
  37
  38   Coding system is an encoding mechanism of one or more character
  39   sets.  Here's a list of coding systems which Emacs can handle.  When
  40   we say "decode", it means converting some other coding system to
  41   Emacs' internal format (emacs-mule), and when we say "encode",
  42   it means converting the coding system emacs-mule to some other
  43   coding system.
  44
  45   0. Emacs' internal format (emacs-mule)
  46
  47   Emacs itself holds a multi-lingual character in a buffer and a string
  48   in a special format.  Details are described in section 2.
  49
  50   1. ISO2022
  51
  52   The most famous coding system for multiple character sets.  X's
  53   Compound Text, various EUCs (Extended Unix Code), and coding
  54   systems used in Internet communication such as ISO-2022-JP are
  55   all variants of ISO2022.  Details are described in section 3.
  56
  57   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  58
  59   A coding system to encode character sets: ASCII, JISX0201, and
  60   JISX0208.  Widely used for PC's in Japan.  Details are described in
  61   section 4.
  62
  63   3. BIG5
  64
  65   A coding system to encode character sets: ASCII and Big5.  Widely
  66   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  67   described in section 4.  In this file, when we write "BIG5"
  68   (all uppercase), we mean the coding system, and when we write
  69   "Big5" (capitalized), we mean the character set.
  70
  71   4. Raw text
  72
  73   A coding system for a text containing random 8-bit code.  Emacs does
  74   no code conversion on such a text except for end-of-line format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How end-of-line of a text is encoded depends on a system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is usually one byte of
  96   `carriage-return'.
  97
  98   Since text characters encoding and end-of-line encoding are
  99   independent, any coding system described above can take
 100   any format of end-of-line.  So, Emacs has information of format of
 101   end-of-line in each coding-system.  See section 6 for more details.
 102
 103 */
 104
 105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 106
 107   These functions check if a text between SRC and SRC_END is encoded
 108   in the coding system category XXX.  Each returns an integer value in
 109   which appropriate flag bits for the category XXX are set.  The flag
 110   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 111   template of these functions.  */
 112 #if 0
 113 int
 114 detect_coding_emacs_mule (src, src_end)
 115      unsigned char *src, *src_end;
 116 {
 117   ...
 118 }
 119 #endif
 120
 121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 122
 123   These functions decode SRC_BYTES length text at SOURCE encoded in
 124   CODING to Emacs' internal format (emacs-mule).  The resulting text
 125   goes to a place pointed to by DESTINATION, the length of which
 126   should not exceed DST_BYTES.  These functions set the information of
 127   original and decoded texts in the members produced, produced_char,
 128   consumed, and consumed_char of the structure *CODING.
 129
 130   The return value is an integer (CODING_FINISH_XXX) indicating how
 131   the decoding finished.
 132
 133   DST_BYTES zero means that source area and destination area are
 134   overlapped, which means that we can produce a decoded text until it
 135   reaches at the head of not-yet-decoded source text.
 136
 137   Below is a template of these functions.  */
 138 #if 0
 139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 140      struct coding_system *coding;
 141      unsigned char *source, *destination;
 142      int src_bytes, dst_bytes;
 143 {
 144   ...
 145 }
 146 #endif
 147
 148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 149
 150   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 151   internal format (emacs-mule) to CODING.  The resulting text goes to
 152   a place pointed to by DESTINATION, the length of which should not
 153   exceed DST_BYTES.  These functions set the information of
 154   original and encoded texts in the members produced, produced_char,
 155   consumed, and consumed_char of the structure *CODING.
 156
 157   The return value is an integer (CODING_FINISH_XXX) indicating how
 158   the encoding finished.
 159
 160   DST_BYTES zero means that source area and destination area are
 161   overlapped, which means that we can produce an encoded text until it
 162   reaches the head of the not-yet-encoded source text.
 163
 164   Below is a template of these functions.  */
 165 #if 0
 166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 167      struct coding_system *coding;
 168      unsigned char *source, *destination;
 169      int src_bytes, dst_bytes;
 170 {
 171   ...
 172 }
 173 #endif
 174
 175 /*** COMMONLY USED MACROS ***/
 176
 177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 178    THREE_MORE_BYTES read one, two, and three bytes respectively from
 179    the caller's `src' into their arguments and advance `src'.  If too
 180    few bytes remain before the caller's `src_end', `src' is not moved
 181    and control jumps to the caller's label `label_end_of_loop'.  */
 182
 183 #define ONE_MORE_BYTE(c1)       \
 184   do {                          \
 185     if (src < src_end)          \
 186       c1 = *src++;              \
 187     else                        \
 188       goto label_end_of_loop;   \
 189   } while (0)
 190
 191 #define TWO_MORE_BYTES(c1, c2)  \
 192   do {                          \
 193     if (src + 1 < src_end)      \
 194       c1 = *src++, c2 = *src++; \
 195     else                        \
 196       goto label_end_of_loop;   \
 197   } while (0)
 198
 199 #define THREE_MORE_BYTES(c1, c2, c3)            \
 200   do {                                          \
 201     if (src + 2 < src_end)                      \
 202       c1 = *src++, c2 = *src++, c3 = *src++;    \
 203     else                                        \
 204       goto label_end_of_loop;                   \
 205   } while (0)
 206
 207 /* The following three macros DECODE_CHARACTER_ASCII,
 208    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 store
 209    the multi-byte form of a character of each class at `dst', advance
 210    `dst', and count the character in `coding' (composed_chars while
 211    composing, else produced_char).  The caller must have set `dst' and
 212    `coding' (the coding system of the text being decoded) in advance.  */
 213
 214 /* Decode one ASCII character C; while composing, emit 0xA0 and C | 0x80.  */
 215 /* NOTE(review): fake_multibyte presumably flags output bytes that are not real multibyte text -- confirm in coding.h.  */
 216 #define DECODE_CHARACTER_ASCII(c)               \
 217   do {                                          \
 218     if (COMPOSING_P (coding->composing))        \
 219       {                                         \
 220         *dst++ = 0xA0, *dst++ = (c) | 0x80;     \
 221         coding->composed_chars++;               \
 222         if (((c) | 0x80) < 0xA0)                \
 223           coding->fake_multibyte = 1;           \
 224       }                                         \
 225     else                                        \
 226       {                                         \
 227         *dst++ = (c);                           \
 228         coding->produced_char++;                \
 229         if ((c) >= 0x80)                        \
 230           coding->fake_multibyte = 1;           \
 231       }                                         \
 232   } while (0)
 233
 234 /* Decode one DIMENSION1 character of CHARSET with position-code C:
 235    leading code (+ 0x20 while composing), optional extended one, C | 0x80.  */
 236 /* The `=' in the extended leading-code test below is an intended assignment.  */
 237 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 238   do {                                                                  \
 239     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 240     if (COMPOSING_P (coding->composing))                                \
 241       {                                                                 \
 242         *dst++ = leading_code + 0x20;                                   \
 243         coding->composed_chars++;                                       \
 244       }                                                                 \
 245     else                                                                \
 246       {                                                                 \
 247         *dst++ = leading_code;                                          \
 248         coding->produced_char++;                                        \
 249       }                                                                 \
 250     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 251       *dst++ = leading_code;                                            \
 252     *dst++ = (c) | 0x80;                                                \
 253     if (((c) | 0x80)  < 0xA0)                                           \
 254       coding->fake_multibyte = 1;                                       \
 255   } while (0)
 256
 257 /* Decode one DIMENSION2 character of CHARSET with position-codes C1
 258    and C2: the DIMENSION1 form for C1 followed by the byte C2 | 0x80.  */
 259
 260 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 261   do {                                                  \
 262     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 263     *dst++ = (c2) | 0x80;                               \
 264     if (((c2) | 0x80) < 0xA0)                           \
 265       coding->fake_multibyte = 1;                       \
 266   } while (0)
 267
 268 \f
 269 /*** 1. Preamble ***/
 270
 271 #include <stdio.h>
 272
 273 #ifdef emacs
 274
 275 #include <config.h>
 276 #include "lisp.h"
 277 #include "buffer.h"
 278 #include "charset.h"
 279 #include "ccl.h"
 280 #include "coding.h"
 281 #include "window.h"
 282
 283 #else  /* not emacs */
 284
 285 #include "mulelib.h"
 286
 287 #endif /* not emacs */
 288 /* NOTE(review): the Q... variables are presumably Lisp symbols set up elsewhere -- confirm.  */
 289 Lisp_Object Qcoding_system, Qeol_type;
 290 Lisp_Object Qbuffer_file_coding_system;
 291 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 292 Lisp_Object Qno_conversion, Qundecided;
 293 Lisp_Object Qcoding_system_history;
 294 Lisp_Object Qsafe_charsets;
 295 Lisp_Object Qvalid_codes;
 296 /* The next two are only declared here (`extern'); they are defined elsewhere.  */
 297 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 298 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 299 Lisp_Object Qstart_process, Qopen_network_stream;
 300 Lisp_Object Qtarget_idx;
 301
 302 Lisp_Object Vselect_safe_coding_system_function;
 303
 304 /* Mnemonic string for each format of end-of-line.  */
 305 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 306 /* Mnemonic string to indicate that the format of end-of-line is not
 307    yet decided.  */
 308 Lisp_Object eol_mnemonic_undecided;
 309
 310 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 311    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 312 int system_eol_type;
 313
 314 #ifdef emacs
 315 /* NOTE(review): presumably the list and alist of all defined coding systems -- confirm.  */
 316 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 317
 318 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 319
 320 /* The coding systems emacs-mule and raw-text convert only the
 321    end-of-line format.  */
 322 Lisp_Object Qemacs_mule, Qraw_text;
 323
 324 /* Coding systems are passed between Emacs Lisp programs and C internal
 325    routines through the following three variables.  */
 326 /* Coding system for reading files and receiving data from a process.  */
 327 Lisp_Object Vcoding_system_for_read;
 328 /* Coding system for writing files and sending data to a process.  */
 329 Lisp_Object Vcoding_system_for_write;
 330 /* Coding system actually used in the latest I/O.  */
 331 Lisp_Object Vlast_coding_system_used;
 332
 333 /* A vector of length 256 which contains information about special
 334    Latin codes (especially for dealing with Microsoft codes).  */
 335 Lisp_Object Vlatin_extra_code_table;
 336
 337 /* Flag to inhibit code conversion of end-of-line format.  */
 338 int inhibit_eol_conversion;
 339
 340 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 341 int inherit_process_coding_system;
 342
 343 /* Coding system to be used to encode text for terminal display.  */
 344 struct coding_system terminal_coding;
 345
 346 /* Coding system to be used to encode text for terminal display when
 347    the terminal coding system is nil.  */
 348 struct coding_system safe_terminal_coding;
 349
 350 /* Coding system of what is sent from the terminal keyboard.  */
 351 struct coding_system keyboard_coding;
 352
 353 /* Default coding system to be used to write a file.  */
 354 struct coding_system default_buffer_file_coding;
 355 /* NOTE(review): presumably alists selecting coding systems for file, process, and network I/O -- confirm.  */
 356 Lisp_Object Vfile_coding_system_alist;
 357 Lisp_Object Vprocess_coding_system_alist;
 358 Lisp_Object Vnetwork_coding_system_alist;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qcoding_category, Qcoding_category_index;
 363
 364 /* List of symbols `coding-category-xxx' ordered by priority.  */
 365 Lisp_Object Vcoding_category_list;
 366
 367 /* Table of coding categories (Lisp symbols).  */
 368 Lisp_Object Vcoding_category_table;
 369
 370 /* Symbol name of each coding category (presumably in CODING_CATEGORY_IDX_XXX order).  */
 371 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 372   "coding-category-emacs-mule",
 373   "coding-category-sjis",
 374   "coding-category-iso-7",
 375   "coding-category-iso-7-tight",
 376   "coding-category-iso-8-1",
 377   "coding-category-iso-8-2",
 378   "coding-category-iso-7-else",
 379   "coding-category-iso-8-else",
 380   "coding-category-ccl",
 381   "coding-category-big5",
 382   "coding-category-raw-text",
 383   "coding-category-binary"
 384 };
 385
 386 /* Table of pointers to the coding system corresponding to each coding
 387    category.  An entry may be a null pointer (see CHARSET_OK).  */
 388 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 389
 390 /* Table of coding category masks.  Nth element is the mask of the
 391    coding category whose priority is Nth.  */
 392 static
 393 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 394
 395 /* Flag to tell if we look up a translation table during character
 396    code conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402 /* NOTE(review): presumably Lisp symbols related to translation tables -- confirm.  */
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 Lisp_Object Vcharset_revision_alist;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 \f
 415 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 416
 417 /* Emacs' internal format for encoding multiple character sets is a
 418    kind of multi-byte encoding, i.e. characters are encoded by
 419    variable-length sequences of one-byte codes.  ASCII characters
 420    and control characters (e.g. `tab', `newline') are represented by
 421    one-byte sequences which are their ASCII codes, in the range 0x00
 422    through 0x7F.  The other characters are represented by a sequence
 423    of `base leading-code', optional `extended leading-code', and one
 424    or two `position-code's.  The length of the sequence is determined
 425    by the base leading-code.  Leading-code takes the range 0x80
 426    through 0x9F, whereas extended leading-code and position-code take
 427    the range 0xA0 through 0xFF.  See `charset.h' for more details
 428    about leading-code and position-code.
 429
 430    There's one exception to this rule.  Special leading-code
 431    `leading-code-composition' denotes that the following several
 432    characters should be composed into one character.  Leading-codes of
 433    components (except for ASCII) are added 0x20.  An ASCII character
 434    component is represented by a 2-byte sequence of `0xA0' and
 435    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 436    details of composite character.  Hence, we can summarize the code
 437    range as follows:
 438
 439    --- CODE RANGE of Emacs' internal format ---
 440    (character set)      (range)
 441    ASCII                0x00 .. 0x7F
 442    ELSE (1st byte)      0x80 .. 0x9F
 443         (rest bytes)    0xA0 .. 0xFF
 444    ---------------------------------------------
 445
 446   */
 447
 448 enum emacs_code_class_type emacs_code_class[256]; /* Code class of each byte value, indexed by the byte.  */
 449
 450 /* Consume *SRC and go on if it is in 0xA0..0xFF; return 0 if it is less
 451    than 0xA0; jump to `label_end_of_switch' if SRC has reached SRC_END.  */
 452 #define CHECK_CODE_RANGE_A0_FF  \
 453   do {                          \
 454     if (src >= src_end)         \
 455       goto label_end_of_switch; \
 456     else if (*src++ < 0xA0)     \
 457       return 0;                 \
 458   } while (0)
 459
 460 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 461    Check if the bytes in [SRC, SRC_END) are in Emacs' internal format.
 462    If so, return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 463 /* A multi-byte sequence cut off by SRC_END is not treated as an error.  */
 464 int
 465 detect_coding_emacs_mule (src, src_end)
 466      unsigned char *src, *src_end;
 467 {
 468   unsigned char c;
 469   int composing = 0;            /* Nonzero while inside a composite character.  */
 470
 471   while (src < src_end)
 472     {
 473       c = *src++;
 474       /* Components of a composite character have 0x20 added to their leading code.  */
 475       if (composing)
 476         {
 477           if (c < 0xA0)
 478             composing = 0;      /* A byte below 0xA0 ends the composition.  */
 479           else
 480             c -= 0x20;          /* Map back to the plain leading code.  */
 481         }
 482
 483       switch (emacs_code_class[c])
 484         {
 485         case EMACS_ascii_code:
 486         case EMACS_linefeed_code:
 487           break;
 488         /* ESC, SI, and SO disqualify the text; other control codes are accepted.  */
 489         case EMACS_control_code:
 490           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 491             return 0;
 492           break;
 493
 494         case EMACS_invalid_code:
 495           return 0;
 496
 497         case EMACS_leading_code_composition: /* c == 0x80 */
 498           if (composing)
 499             CHECK_CODE_RANGE_A0_FF; /* Byte after the 0xA0 of an ASCII component.  */
 500           else
 501             composing = 1;      /* Start of a composite character.  */
 502           break;
 503
 504         case EMACS_leading_code_4:
 505           CHECK_CODE_RANGE_A0_FF;
 506           /* fall down to check it two more times ...  */
 507
 508         case EMACS_leading_code_3:
 509           CHECK_CODE_RANGE_A0_FF;
 510           /* fall down to check it one more time ...  */
 511
 512         case EMACS_leading_code_2:
 513           CHECK_CODE_RANGE_A0_FF;
 514           break;
 515
 516         default:
 517         label_end_of_switch:    /* Target of CHECK_CODE_RANGE_A0_FF when SRC runs out.  */
 518           break;
 519         }
 520     }
 521   return CODING_CATEGORY_MASK_EMACS_MULE;
 522 }
 523
 524 \f
 525 /*** 3. ISO2022 handlers ***/
 526
 527 /* The following note describes the coding system ISO2022 briefly.
 528    Since the intention of this note is to help in understanding of
 529    the programs in this file, some parts are NOT ACCURATE or OVERLY
 530    SIMPLIFIED.  For the thorough understanding, please refer to the
 531    original document of ISO2022.
 532
 533    ISO2022 provides many mechanisms to encode several character sets
 534    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 535    all text is encoded by codes of less than 128.  This may make the
 536    encoded text a little bit longer, but the text gets more stability
 537    to pass through several gateways (some of them strip off the MSB).
 538
 539    There are two kinds of character set: control character set and
 540    graphic character set.  The former contains control characters such
 541    as `newline' and `escape' to provide control functions (control
 542    functions are provided also by escape sequences).  The latter
 543    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 544    two control character sets and many graphic character sets.
 545
 546    Graphic character sets are classified into one of the following
 547    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 548    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 549    bytes (DIMENSION) and the number of characters in one dimension
 550    (CHARS) of the set.  In addition, each character set is assigned an
 551    identification tag (called "final character" and denoted as <F>
 552    here after) which is unique in each class.  <F> of each character
 553    set is decided by ECMA(*) when it is registered in ISO.  Code range
 554    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 555
 556    Note (*): ECMA = European Computer Manufacturers Association
 557
 558    Here are examples of graphic character set [NAME(<F>)]:
 559         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 560         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 561         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 562         o DIMENSION2_CHARS96 -- none for the moment
 563
 564    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 565         C0 [0x00..0x1F] -- control character plane 0
 566         GL [0x20..0x7F] -- graphic character plane 0
 567         C1 [0x80..0x9F] -- control character plane 1
 568         GR [0xA0..0xFF] -- graphic character plane 1
 569
 570    A control character set is directly designated and invoked to C0 or
 571    C1 by an escape sequence.  The most common case is that ISO646's
 572    control character set is designated/invoked to C0 and ISO6429's
 573    control character set is designated/invoked to C1, and usually
 574    these designations/invocations are omitted in a coded text.  With
 575    7-bit environment, only C0 can be used, and a control character for
 576    C1 is encoded by an appropriate escape sequence to fit in the
 577    environment.  Every control character for C1 has a
 578    corresponding escape sequence defined.
 579
 580    A graphic character set is at first designated to one of four
 581    graphic registers (G0 through G3), then these graphic registers are
 582    invoked to GL or GR.  These designations and invocations can be
 583    done independently.  The most common case is that G0 is invoked to
 584    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 585    these invocations and designations are omitted in a coded text.
 586    With 7-bit environment, only GL can be used.
 587
 588    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 589    and 0x7F of GL area work as control characters SPACE and DEL
 590    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 591
 592    There are two ways of invocation: locking-shift and single-shift.
 593    With locking-shift, the invocation lasts until the next different
 594    invocation, whereas with single-shift, the invocation works only
 595    for the following character and doesn't affect locking-shift.
 596    Invocations are done by the following control characters or escape
 597    sequences.
 598
 599    ----------------------------------------------------------------------
 600    function             control char    escape sequence description
 601    ----------------------------------------------------------------------
 602    SI  (shift-in)               0x0F    none            invoke G0 to GL
 603    SO  (shift-out)              0x0E    none            invoke G1 to GL
 604    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 605    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 606    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 607    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 608    ----------------------------------------------------------------------
 609    The first four are for locking-shift.  Control characters for these
 610    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 611
 612    Designations are done by the following escape sequences.
 613    ----------------------------------------------------------------------
 614    escape sequence      description
 615    ----------------------------------------------------------------------
 616    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 617    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 618    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 619    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 620    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 621    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 622    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 623    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 624    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 625    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 626    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 627    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 628    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 629    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 630    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 631    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 632    ----------------------------------------------------------------------
 633
 634    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 635    of dimension 1, chars 94, and final character <F>, and etc.
 636
 637    Note (*): Although these designations are not allowed in ISO2022,
 638    Emacs accepts them on decoding, and produces them on encoding
 639    CHARS96 character set in a coding system which is characterized as
 640    7-bit environment, non-locking-shift, and non-single-shift.
 641
 642    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 643    '(' can be omitted.  We call this as "short-form" here after.
 644
 645    Now you may notice that there are a lot of ways for encoding the
 646    same multilingual text in ISO2022.  Actually, there exist many
 647    coding systems such as Compound Text (used in X's inter client
 648    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 649    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 650    localized platforms), and all of these are variants of ISO2022.
 651
 652    In addition to the above, Emacs handles two more kinds of escape
 653    sequences: ISO6429's direction specification and Emacs' private
 654    sequence for specifying character composition.
 655
 656    ISO6429's direction specification takes the following format:
 657         o CSI ']'      -- end of the current direction
 658         o CSI '0' ']'  -- end of the current direction
 659         o CSI '1' ']'  -- start of left-to-right text
 660         o CSI '2' ']'  -- start of right-to-left text
 661    The control character CSI (0x9B: control sequence introducer) is
 662    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 663
 664    Character composition specification takes the following format:
 665         o ESC '0' -- start character composition
 666         o ESC '1' -- end character composition
 667    Since these are not standard escape sequences of any ISO, the use
 668    of them for these meaning is restricted to Emacs only.  */
 669
 670 enum iso_code_class_type iso_code_class[256]; /* NOTE(review): presumably the ISO2022 class of each byte value -- confirm.  */
 671 /* Nonzero if coding_system_table[IDX] is set and marks CHARSET safe or has a requested designation for it.  */
 672 #define CHARSET_OK(idx, charset)                                \
 673   (coding_system_table[idx]                                     \
 674    && (coding_system_table[idx]->safe_charsets[charset]         \
 675        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 676             (coding_system_table[idx], charset)                 \
 677            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 678 /* Nonzero if coding_system_table[IDX] has an initial designation (>= 0) for index 1 (presumably G1 -- confirm).  */
 679 #define SHIFT_OUT_OK(idx) \
 680   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 681
 682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 683    Check if the text in SRC..SRC_END is encoded in ISO2022.  If it
 684    is, return an integer in which any of the flag bits:
 685         CODING_CATEGORY_MASK_ISO_7
 686         CODING_CATEGORY_MASK_ISO_7_TIGHT
 687         CODING_CATEGORY_MASK_ISO_8_1
 688         CODING_CATEGORY_MASK_ISO_8_2
 689         CODING_CATEGORY_MASK_ISO_7_ELSE
 690         CODING_CATEGORY_MASK_ISO_8_ELSE
 691    are set.  If a code which should never appear in ISO2022 is found,
 692    return 0.  */
 693
 694 int
 695 detect_coding_iso2022 (src, src_end)
 696      unsigned char *src, *src_end;
 697 {
 698   int mask = CODING_CATEGORY_MASK_ISO;  /* Categories not yet ruled out.  */
 699   int mask_found = 0;           /* Categories with positive evidence.  */
 700   int reg[4], shift_out = 0, single_shifting = 0;
 701   int c, c1, charset;
 702
 703   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 704   while (mask && src < src_end)
 705     {
 706       c = *src++;
 707       switch (c)
 708         {
 709         case ISO_CODE_ESC:
 710           single_shifting = 0;
 711           if (src >= src_end)
 712             break;
 713           c = *src++;
 714           if (c >= '(' && c <= '/')
 715             {
 716               /* Designation sequence for a charset of dimension 1.  */
 717               if (src >= src_end)
 718                 break;
 719               c1 = *src++;
 720               if (c1 < ' ' || c1 >= 0x80
 721                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 722                 /* Invalid designation sequence.  Just ignore.  */
 723                 break;
 724               reg[(c - '(') % 4] = charset;
 725             }
 726           else if (c == '$')
 727             {
 728               /* Designation sequence for a charset of dimension 2.  */
 729               if (src >= src_end)
 730                 break;
 731               c = *src++;
 732               if (c >= '@' && c <= 'B')
 733                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 734                 reg[0] = charset = iso_charset_table[1][0][c];
 735               else if (c >= '(' && c <= '/')
 736                 {
 737                   if (src >= src_end)
 738                     break;
 739                   c1 = *src++;
 740                   if (c1 < ' ' || c1 >= 0x80
 741                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 742                     /* Invalid designation sequence.  Just ignore.  */
 743                     break;
 744                   reg[(c - '(') % 4] = charset;
 745                 }
 746               else
 747                 /* Invalid designation sequence.  Just ignore.  */
 748                 break;
 749             }
 750           else if (c == 'N' || c == 'O')
 751             {
 752               /* ESC <Fe> for SS2 or SS3.  */
 753               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 754               break;
 755             }
 756           else if (c == '0' || c == '1' || c == '2')
 757             /* ESC <Fp> for start/end composition.  Just ignore.  */
 758             break;
 759           else
 760             /* Invalid escape sequence.  Just ignore.  */
 761             break;
 762
 763           /* We found a valid designation sequence for CHARSET.  */
 764           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 765           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 766             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 767           else
 768             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 769           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 770             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 771           else
 772             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 773           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 774             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 775           else
 776             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 777           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 778             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 779           else
 780             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 781           break;
 782
 783         case ISO_CODE_SO:
 784           single_shifting = 0;
 785           if (shift_out == 0
 786               && (reg[1] >= 0
 787                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 788                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 789             {
 790               shift_out = 1;    /* Locking shift out.  */
 791               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 792               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 793             }
 794           break;
 795
 796         case ISO_CODE_SI:
 797           single_shifting = 0;
 798           if (shift_out == 1)
 799             {
 800               shift_out = 0;    /* Locking shift in.  */
 801               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 802               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 803             }
 804           break;
 805
 806         case ISO_CODE_CSI:
 807           single_shifting = 0;  /* Fall through.  */
 808         case ISO_CODE_SS2:
 809         case ISO_CODE_SS3:
 810           {
 811             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 812
 813             if (c != ISO_CODE_CSI)
 814               {
 815                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 816                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 817                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 818                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 819                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 820                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 821                 single_shifting = 1;
 822               }
 823             if (VECTORP (Vlatin_extra_code_table)
 824                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 825               {
 826                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 827                     & CODING_FLAG_ISO_LATIN_EXTRA)
 828                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 829                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 830                     & CODING_FLAG_ISO_LATIN_EXTRA)
 831                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 832               }
 833             mask &= newmask;
 834             mask_found |= newmask;
 835           }
 836           break;
 837
 838         default:
 839           if (c < 0x80)
 840             {
 841               single_shifting = 0;
 842               break;
 843             }
 844           else if (c < 0xA0)
 845             {
 846               single_shifting = 0;
 847               if (VECTORP (Vlatin_extra_code_table)
 848                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 849                 {
 850                   int newmask = 0;
 851
 852                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 853                       & CODING_FLAG_ISO_LATIN_EXTRA)
 854                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 855                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 856                       & CODING_FLAG_ISO_LATIN_EXTRA)
 857                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 858                   mask &= newmask;
 859                   mask_found |= newmask;
 860                 }
 861               else
 862                 return 0;
 863             }
 864           else
 865             {
 866               unsigned char *src_begin = src;
 867
 868               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 869                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 870               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 871               /* Check the length of succeeding codes of the range
 872                  0xA0..0xFF.  If the byte length is odd, we exclude
 873                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 874                  when we are not single shifting.  */
 875               if (!single_shifting)
 876                 {
 877                   while (src < src_end && *src >= 0xA0)
 878                     src++;
 879                   if ((src - src_begin - 1) & 1 && src < src_end)
 880                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 881                   else
 882                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
 883                 }
 884             }
 885           break;
 886         }
 887     }
 888
 889   return (mask & mask_found);   /* Possible and positively seen.  */
 890 }
 891
 892 /* Decode a character whose charset is CHARSET and whose 1st position
 893    code is C1, using CODING, SRC, DST, C2 and TRANSLATION_TABLE of the
 894    expansion context.  If the dimension of CHARSET is 2, the 2nd
 895    position code is fetched from SRC into C2.  If CHARSET is negative,
 896    the text is ill-formed and C1 is just decoded as ASCII.  */
 897
 898 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 899   do {                                                                  \
 900     int c_alt, charset_alt = (charset);                                 \
 901     if (COMPOSING_HEAD_P (coding->composing))                           \
 902       {                                                                 \
 903         *dst++ = LEADING_CODE_COMPOSITION;                              \
 904         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 905           /* To tell composition rules are embedded.  */                \
 906           *dst++ = 0xFF;                                                \
 907         coding->composing += 2;                                         \
 908       }                                                                 \
 909     if (charset_alt >= 0)                                               \
 910       {                                                                 \
 911         if (CHARSET_DIMENSION (charset_alt) == 2)                       \
 912           {                                                             \
 913             ONE_MORE_BYTE (c2);                                         \
 914             if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
 915                 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
 916               {                                                         \
 917                 src--;  /* Push C2 back; decode C1 as ASCII.  */        \
 918                 charset_alt = CHARSET_ASCII;                            \
 919               }                                                         \
 920           }                                                             \
 921         if (!NILP (translation_table)                                   \
 922             && ((c_alt = translate_char (translation_table,             \
 923                                          -1, charset_alt, c1, c2)) >= 0)) \
 924           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 925       }                                                                 \
 926     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 927       DECODE_CHARACTER_ASCII (c1);                                      \
 928     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 929       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 930     else                                                                \
 931       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 932     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 933       /* To tell a composition rule follows.  */                        \
 934       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 935   } while (0)
 936
 937 /* Set designation state into CODING, or goto label_invalid_code.  */
 938 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
 939   do {                                                                     \
 940     int charset;                                                           \
 941                                                                            \
 942     if ((final_char) < '0' || (final_char) >= 128)                         \
 943       goto label_invalid_code;                                             \
 944     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
 945                                  make_number (chars),                      \
 946                                  make_number (final_char));                \
 947     if (charset >= 0                                                       \
 948         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
 949             || coding->safe_charsets[charset]))                            \
 950       {                                                                    \
 951         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
 952             && (reg) == 0                                                  \
 953             && charset == CHARSET_ASCII)                                   \
 954           {                                                                \
 955             /* We should insert this designation sequence as is so         \
 956                that it is surely written back to a file.  */               \
 957             coding->spec.iso2022.last_invalid_designation_register = -1;   \
 958             goto label_invalid_code;                                       \
 959           }                                                                \
 960         coding->spec.iso2022.last_invalid_designation_register = -1;       \
 961         if ((coding->mode & CODING_MODE_DIRECTION)                         \
 962             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
 963           charset = CHARSET_REVERSE_CHARSET (charset);                     \
 964         CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;             \
 965       }                                                                    \
 966     else                                                                   \
 967       {                                                                    \
 968         coding->spec.iso2022.last_invalid_designation_register = (reg);    \
 969         goto label_invalid_code;                                           \
 970       }                                                                    \
 971   } while (0)
 972
 973 /* Return 0 if a composing sequence that CODING can decode starts at
 974    SRC and is closed by "ESC 1" before SRC_END, else return -1.  */
 975
 976 int
 977 check_composing_code (coding, src, src_end)
 978      struct coding_system *coding;
 979      unsigned char *src, *src_end;
 980 {
 981   int charset, c, c1, dim;
 982
 983   while (src < src_end)
 984     {
 985       c = *src++;
 986       if (c >= 0x20)            /* Not a control code below 0x20.  */
 987         continue;
 988       if (c != ISO_CODE_ESC || src >= src_end)
 989         return -1;              /* Only ESC is accepted here.  */
 990       c = *src++;
 991       if (c == '1') /* end of composition */
 992         return 0;
 993       if (src + 2 >= src_end
 994           || ! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
 995         return -1;
 996
 997       dim = (c == '$');         /* 1 for a 2-byte charset designation.  */
 998       if (dim == 1)
 999         c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1000       if (c >= '(' && c <= '/')
1001         {
1002           c1 = *src++;
1003           if ((c1 < ' ' || c1 >= 0x80)
1004               || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1005               || ! coding->safe_charsets[charset]
1006               || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1007                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1008             return -1;
1009         }
1010       else
1011         return -1;
1012     }
1013
1014   /* We have not found the sequence "ESC 1".  */
1015   return -1;
1016 }
1017
1018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1019    Decode ISO2022 text at SOURCE; return a CODING_FINISH_XXX code.  */
1020 int
1021 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1022      struct coding_system *coding;
1023      unsigned char *source, *destination;
1024      int src_bytes, dst_bytes;
1025 {
1026   unsigned char *src = source;
1027   unsigned char *src_end = source + src_bytes;
1028   unsigned char *dst = destination;
1029   unsigned char *dst_end = destination + dst_bytes;
1030   /* Since the maximum bytes produced by each loop is 7, we subtract 6
1031      from DST_END to assure that overflow checking is necessary only
1032      at the head of loop.  */
1033   unsigned char *adjusted_dst_end = dst_end - 6;
1034   int charset;
1035   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1036   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1037   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1038   Lisp_Object translation_table
1039     = coding->translation_table_for_decode;
1040   int result = CODING_FINISH_NORMAL;
1041
1042   if (!NILP (Venable_character_translation) && NILP (translation_table))
1043     translation_table = Vstandard_translation_table_for_decode;
1044
1045   coding->produced_char = 0;
1046   coding->composed_chars = 0;
1047   coding->fake_multibyte = 0;
1048   while (src < src_end && (dst_bytes
1049                            ? (dst < adjusted_dst_end)
1050                            : (dst < src - 6)))
1051     {
1052       /* SRC_BASE remembers the start position in source in each loop.
1053          The loop will be exited when there's not enough source text
1054          to analyze long escape sequence or 2-byte code (within macros
1055          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
1056          to SRC_BASE before exiting.  */
1057       unsigned char *src_base = src;
1058       int c1 = *src++, c2;
1059
1060       switch (iso_code_class [c1])
1061         {
1062         case ISO_0x20_or_0x7F:
1063           if (!coding->composing
1064               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1065             {
1066               /* This is SPACE or DEL.  */
1067               *dst++ = c1;
1068               coding->produced_char++;
1069               break;
1070             }
1071           /* This is a graphic character, we fall down ...  */
1072
1073         case ISO_graphic_plane_0:
1074           if (coding->composing == COMPOSING_WITH_RULE_RULE)
1075             {
1076               /* This is a composition rule.  */
1077               *dst++ = c1 | 0x80;
1078               coding->composing = COMPOSING_WITH_RULE_TAIL;
1079             }
1080           else
1081             DECODE_ISO_CHARACTER (charset0, c1);
1082           break;
1083
1084         case ISO_0xA0_or_0xFF:
1085           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1086               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1087             goto label_invalid_code;
1088           /* This is a graphic character, we fall down ... */
1089
1090         case ISO_graphic_plane_1:
1091           if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1092             goto label_invalid_code;
1093           else
1094             DECODE_ISO_CHARACTER (charset1, c1);
1095           break;
1096
1097         case ISO_control_code:
1098           /* All ISO2022 control characters in this class have the
1099              same representation in Emacs internal format.  */
1100           if (c1 == '\n'
1101               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1102               && (coding->eol_type == CODING_EOL_CR
1103                   || coding->eol_type == CODING_EOL_CRLF))
1104             {
1105               result = CODING_FINISH_INCONSISTENT_EOL;
1106               goto label_end_of_loop_2;
1107             }
1108           *dst++ = c1;
1109           coding->produced_char++;
1110           if (c1 >= 0x80)
1111             coding->fake_multibyte = 1;
1112           break;
1113
1114         case ISO_carriage_return:
1115           if (coding->eol_type == CODING_EOL_CR)
1116             *dst++ = '\n';
1117           else if (coding->eol_type == CODING_EOL_CRLF)
1118             {
1119               ONE_MORE_BYTE (c1);
1120               if (c1 == ISO_CODE_LF)
1121                 *dst++ = '\n';
1122               else
1123                 {
1124                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1125                     {
1126                       result = CODING_FINISH_INCONSISTENT_EOL;
1127                       goto label_end_of_loop_2;
1128                     }
1129                   src--;
1130                   *dst++ = '\r';
1131                 }
1132             }
1133           else
1134             *dst++ = c1;
1135           coding->produced_char++;
1136           break;
1137
1138         case ISO_shift_out:
1139           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1140               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1141             goto label_invalid_code;
1142           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1143           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1144           break;
1145
1146         case ISO_shift_in:
1147           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1148             goto label_invalid_code;
1149           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1150           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1151           break;
1152
1153         case ISO_single_shift_2_7:
1154         case ISO_single_shift_2:
1155           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1156             goto label_invalid_code;
1157           /* SS2 is handled as an escape sequence of ESC 'N' */
1158           c1 = 'N';
1159           goto label_escape_sequence;
1160
1161         case ISO_single_shift_3:
1162           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1163             goto label_invalid_code;
1164           /* SS3 is handled as an escape sequence of ESC 'O' */
1165           c1 = 'O';
1166           goto label_escape_sequence;
1167
1168         case ISO_control_sequence_introducer:
1169           /* CSI is handled as an escape sequence of ESC '[' ...  */
1170           c1 = '[';
1171           goto label_escape_sequence;
1172
1173         case ISO_escape:
1174           ONE_MORE_BYTE (c1);
1175         label_escape_sequence:
1176           /* Escape sequences handled by Emacs are invocation,
1177              designation, direction specification, and character
1178              composition specification.  */
1179           switch (c1)
1180             {
1181             case '&':           /* revision of following character set */
1182               ONE_MORE_BYTE (c1);
1183               if (!(c1 >= '@' && c1 <= '~'))
1184                 goto label_invalid_code;
1185               ONE_MORE_BYTE (c1);
1186               if (c1 != ISO_CODE_ESC)
1187                 goto label_invalid_code;
1188               ONE_MORE_BYTE (c1);
1189               goto label_escape_sequence;
1190
1191             case '$':           /* designation of 2-byte character set */
1192               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1193                 goto label_invalid_code;
1194               ONE_MORE_BYTE (c1);
1195               if (c1 >= '@' && c1 <= 'B')
1196                 {       /* designation of JISX0208.1978, GB2312.1980,
1197                            or JISX0208.1980 */
1198                   DECODE_DESIGNATION (0, 2, 94, c1);
1199                 }
1200               else if (c1 >= 0x28 && c1 <= 0x2B)
1201                 {       /* designation of DIMENSION2_CHARS94 character set */
1202                   ONE_MORE_BYTE (c2);
1203                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1204                 }
1205               else if (c1 >= 0x2C && c1 <= 0x2F)
1206                 {       /* designation of DIMENSION2_CHARS96 character set */
1207                   ONE_MORE_BYTE (c2);
1208                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1209                 }
1210               else
1211                 goto label_invalid_code;
1212               break;
1213
1214             case 'n':           /* invocation of locking-shift-2 */
1215               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1216                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1217                 goto label_invalid_code;
1218               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1219               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1220               break;
1221
1222             case 'o':           /* invocation of locking-shift-3 */
1223               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1224                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1225                 goto label_invalid_code;
1226               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1227               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1228               break;
1229
1230             case 'N':           /* invocation of single-shift-2 */
1231               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1232                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1233                 goto label_invalid_code;
1234               ONE_MORE_BYTE (c1);
1235               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1236               DECODE_ISO_CHARACTER (charset, c1);
1237               break;
1238
1239             case 'O':           /* invocation of single-shift-3 */
1240               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1242                 goto label_invalid_code;
1243               ONE_MORE_BYTE (c1);
1244               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1245               DECODE_ISO_CHARACTER (charset, c1);
1246               break;
1247
1248             case '0': case '2': /* start composing */
1249               /* Before processing composing, we must be sure that all
1250                  characters being composed are supported by CODING.
1251                  If not, we must give up composing.  */
1252               if (check_composing_code (coding, src, src_end) == 0)
1253                 {
1254                   /* We are looking at a valid composition sequence.  */
1255                   coding->composing = (c1 == '0'
1256                                        ? COMPOSING_NO_RULE_HEAD
1257                                        : COMPOSING_WITH_RULE_HEAD);
1258                   coding->composed_chars = 0;
1259                 }
1260               else
1261                 {
1262                   *dst++ = ISO_CODE_ESC;
1263                   *dst++ = c1;
1264                   coding->produced_char += 2;
1265                 }
1266               break;
1267
1268             case '1':           /* end composing */
1269               if (!coding->composing)
1270                 {
1271                   *dst++ = ISO_CODE_ESC;
1272                   *dst++ = c1;
1273                   coding->produced_char += 2;
1274                   break;
1275                 }
1276
1277               if (coding->composed_chars > 0)
1278                 {
1279                   if (coding->composed_chars == 1)
1280                     {
1281                       unsigned char *this_char_start = dst;
1282                       int this_bytes;
1283
1284                       /* Only one character is in the composing
1285                          sequence.  Make it a normal character.  */
1286                       while (*--this_char_start != LEADING_CODE_COMPOSITION);
1287                       dst = (this_char_start
1288                              + (coding->composing == COMPOSING_NO_RULE_TAIL
1289                                 ? 1 : 2));
1290                       *dst -= 0x20;
1291                       if (*dst == 0x80)
1292                         *++dst &= 0x7F;
1293                       this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1294                       while (this_bytes--) *this_char_start++ = *dst++;
1295                       dst = this_char_start;
1296                     }
1297                   coding->produced_char++;
1298                 }
1299               coding->composing = COMPOSING_NO;
1300               break;
1301
1302             case '[':           /* specification of direction */
1303               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1304                 goto label_invalid_code;
1305               /* For the moment, nested direction is not supported.
1306                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1307                  left-to-right, and nonzero means right-to-left.  */
1308               ONE_MORE_BYTE (c1);
1309               switch (c1)
1310                 {
1311                 case ']':       /* end of the current direction */
1312                   coding->mode &= ~CODING_MODE_DIRECTION;
1313                   break;        /* "CSI ]" is complete by itself.  */
1314                 case '0':       /* end of the current direction */
1315                 case '1':       /* start of left-to-right direction */
1316                   ONE_MORE_BYTE (c1);
1317                   if (c1 == ']')
1318                     coding->mode &= ~CODING_MODE_DIRECTION;
1319                   else
1320                     goto label_invalid_code;
1321                   break;
1322
1323                 case '2':       /* start of right-to-left direction */
1324                   ONE_MORE_BYTE (c1);
1325                   if (c1 == ']')
1326                     coding->mode |= CODING_MODE_DIRECTION;
1327                   else
1328                     goto label_invalid_code;
1329                   break;
1330
1331                 default:
1332                   goto label_invalid_code;
1333                 }
1334               break;
1335
1336             default:
1337               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1338                 goto label_invalid_code;
1339               if (c1 >= 0x28 && c1 <= 0x2B)
1340                 {       /* designation of DIMENSION1_CHARS94 character set */
1341                   ONE_MORE_BYTE (c2);
1342                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1343                 }
1344               else if (c1 >= 0x2C && c1 <= 0x2F)
1345                 {       /* designation of DIMENSION1_CHARS96 character set */
1346                   ONE_MORE_BYTE (c2);
1347                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1348                 }
1349               else
1350                 {
1351                   goto label_invalid_code;
1352                 }
1353             }
1354           /* We must update these variables now.  */
1355           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1356           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1357           break;
1358
1359         label_invalid_code:
1360           while (src_base < src)        /* Copy the raw bytes as is.  */
1361             *dst++ = *src_base++;
1362           coding->fake_multibyte = 1;
1363         }
1364       continue;
1365
1366     label_end_of_loop:
1367       result = CODING_FINISH_INSUFFICIENT_SRC;
1368     label_end_of_loop_2:
1369       src = src_base;
1370       break;
1371     }
1372
1373   if (src < src_end)
1374     {
1375       if (result == CODING_FINISH_NORMAL)
1376         result = CODING_FINISH_INSUFFICIENT_DST;
1377       else if (result != CODING_FINISH_INCONSISTENT_EOL
1378                && coding->mode & CODING_MODE_LAST_BLOCK)
1379         {
1380           /* This is the last block of the text to be decoded.  We had
1381              better just flush out all remaining codes in the text
1382              although they are not valid characters.  */
1383           src_bytes = src_end - src;
1384           if (dst_bytes && (dst_end - dst < src_bytes))
1385             src_bytes = dst_end - dst;
1386           bcopy (src, dst, src_bytes);
1387           dst += src_bytes;
1388           src += src_bytes;
1389           coding->fake_multibyte = 1;
1390         }
1391     }
1392
1393   coding->consumed = coding->consumed_char = src - source;
1394   coding->produced = dst - destination;
1395   return result;
1396 }
1397
1398 /* ISO2022 encoding stuff.  */
1399
1400 /*
1401    It is not enough to say just "ISO2022" on encoding, we have to
1402    specify more details.  In Emacs, each coding system of ISO2022
1403    variant has the following specifications:
1404         1. Initial designation to G0 thru G3.
1405         2. Allows short-form designation?
1406         3. ASCII should be designated to G0 before control characters?
1407         4. ASCII should be designated to G0 at end of line?
1408         5. 7-bit environment or 8-bit environment?
1409         6. Use locking-shift?
1410         7. Use Single-shift?
1411    And the following two are only for Japanese:
1412         8. Use ASCII in place of JISX0201-1976-Roman?
1413         9. Use JISX0208-1983 in place of JISX0208-1978?
1414    These specifications are encoded in `coding->flags' as flag bits
1415    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1416    details.
1417 */
1418
1419 /* Produce codes (escape sequence) for designating CHARSET to graphic
1420    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1421    the coding system CODING allows, produce designation sequence of
1422    short-form.  */
1423
     /* The bytes are stored through the pointer variable `dst', which must
        be in scope where the macro is expanded and is advanced past the
        bytes produced.  In emission order:
          - when the revision number recorded in CODING for CHARSET is
            below 255: ESC, `&', and the byte '@' + revision;
          - ESC, followed by `$' when CHARSET_DIMENSION (CHARSET) is not 1;
          - an intermediate character picked by REG from "()*+" (CHARSET
            has 94 characters) or from ",-./" (otherwise).  It is left out
            only in the short-form case: dimension is not 1, CHARSET has
            94 characters, CODING_FLAG_ISO_SHORT_FORM is set, REG is 0 and
            the final character lies in '@'..'B';
          - the final character of CHARSET.
        Finally CHARSET is recorded in CODING as the current designation
        of REG.  */
1424 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1425   do {                                                                  \
1426     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1427     char *intermediate_char_94 = "()*+";                                \
1428     char *intermediate_char_96 = ",-./";                                \
1429     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
1430     if (revision < 255)                                                 \
1431       {                                                                 \
1432         *dst++ = ISO_CODE_ESC;                                          \
1433         *dst++ = '&';                                                   \
1434         *dst++ = '@' + revision;                                        \
1435       }                                                                 \
1436     *dst++ = ISO_CODE_ESC;                                              \
1437     if (CHARSET_DIMENSION (charset) == 1)                               \
1438       {                                                                 \
1439         if (CHARSET_CHARS (charset) == 94)                              \
1440           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1441         else                                                            \
1442           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1443       }                                                                 \
1444     else                                                                \
1445       {                                                                 \
1446         *dst++ = '$';                                                   \
1447         if (CHARSET_CHARS (charset) == 94)                              \
1448           {                                                             \
1449             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1450                 || reg != 0                                             \
1451                 || final_char < '@' || final_char > 'B')                \
1452               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1453           }                                                             \
1454         else                                                            \
1455           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1456       }                                                                 \
1457     *dst++ = final_char;                                                \
1458     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1459   } while (0)
1460
1461 /* The following two macros produce codes (control character or escape
1462    sequence) for ISO2022 single-shift functions (single-shift-2 and
1463    single-shift-3).  */
1464
     /* Both macros use the variables `coding' and `dst' of the scope in
        which they are expanded.  When CODING_FLAG_ISO_SEVEN_BITS is set
        the shift is written as the two bytes ESC `N' (single-shift-2) or
        ESC `O' (single-shift-3); otherwise the single byte ISO_CODE_SS2 or
        ISO_CODE_SS3 is written and coding->fake_multibyte is set to 1.
        In either case the single-shifting state of CODING is set to 1; the
        ENCODE_ISO_CHARACTER_DIMENSION* macros test and clear it when they
        emit the next character.  */
1465 #define ENCODE_SINGLE_SHIFT_2                           \
1466   do {                                                  \
1467     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1468       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1469     else                                                \
1470       {                                                 \
1471         *dst++ = ISO_CODE_SS2;                          \
1472         coding->fake_multibyte = 1;                     \
1473       }                                                 \
1474     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1475   } while (0)
1476
1477 #define ENCODE_SINGLE_SHIFT_3                           \
1478   do {                                                  \
1479     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1480       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1481     else                                                \
1482       {                                                 \
1483         *dst++ = ISO_CODE_SS3;                          \
1484         coding->fake_multibyte = 1;                     \
1485       }                                                 \
1486     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1487   } while (0)
1488
1489 /* The following four macros produce codes (control character or
1490    escape sequence) for ISO2022 locking-shift functions (shift-in,
1491    shift-out, locking-shift-2, and locking-shift-3).  */
1492
     /* Each macro writes its code through the variable `dst' of the
        expanding scope and then records in `coding' which graphic
        register is now invoked to graphic plane 0:
          ENCODE_SHIFT_IN         ISO_CODE_SI   register 0
          ENCODE_SHIFT_OUT        ISO_CODE_SO   register 1
          ENCODE_LOCKING_SHIFT_2  ESC `n'       register 2
          ENCODE_LOCKING_SHIFT_3  ESC `o'       register 3  */
1493 #define ENCODE_SHIFT_IN                         \
1494   do {                                          \
1495     *dst++ = ISO_CODE_SI;                       \
1496     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1497   } while (0)
1498
1499 #define ENCODE_SHIFT_OUT                        \
1500   do {                                          \
1501     *dst++ = ISO_CODE_SO;                       \
1502     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1503   } while (0)
1504
1505 #define ENCODE_LOCKING_SHIFT_2                  \
1506   do {                                          \
1507     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1508     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1509   } while (0)
1510
1511 #define ENCODE_LOCKING_SHIFT_3                  \
1512   do {                                          \
1513     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1514     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1515   } while (0)
1516
1517 /* Produce codes for a DIMENSION1 character whose character set is
1518    CHARSET and whose position-code is C1.  Designation and invocation
1519    sequences are also produced in advance if necessary.  */
1520
     /* The macro reads `coding' and writes through `dst', both taken from
        the scope in which it is expanded.  Its body is a `do ... while (1)'
        loop which is left by `break' as soon as the character has been
        written.  The cases are tried in this order:
          - a single-shift is pending: write C1 with the 8th bit cleared
            (CODING_FLAG_ISO_SEVEN_BITS set) or with the 8th bit set, and
            clear the pending state;
          - CHARSET is the charset of graphic plane 0: write C1 & 0x7F;
          - CHARSET is the charset of graphic plane 1: write C1 | 0x80;
          - CODING_FLAG_ISO_SAFE is set and CHARSET has no entry in
            coding->safe_charsets: write the substitution byte
            CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice when
            CHARSET_WIDTH (CHARSET) is 2.
        If none applies, encode_invocation_designation is called to emit
        the needed designation/invocation and the loop runs again.  */
1521
1522 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1523   do {                                                                  \
1524     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1525       {                                                                 \
1526         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1527           *dst++ = c1 & 0x7F;                                           \
1528         else                                                            \
1529           *dst++ = c1 | 0x80;                                           \
1530         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1531         break;                                                          \
1532       }                                                                 \
1533     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1534       {                                                                 \
1535         *dst++ = c1 & 0x7F;                                             \
1536         break;                                                          \
1537       }                                                                 \
1538     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1539       {                                                                 \
1540         *dst++ = c1 | 0x80;                                             \
1541         break;                                                          \
1542       }                                                                 \
1543     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1544              && !coding->safe_charsets[charset])                        \
1545       {                                                                 \
1546         /* We should not encode this character, instead produce one or  \
1547            two `?'s.  */                                                \
1548         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1549         if (CHARSET_WIDTH (charset) == 2)                               \
1550           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1551         break;                                                          \
1552       }                                                                 \
1553     else                                                                \
1554       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1555          must invoke it, or, at first, designate it to some graphic     \
1556          register.  Then repeat the loop to actually produce the        \
1557          character.  */                                                 \
1558       dst = encode_invocation_designation (charset, coding, dst);       \
1559   } while (1)
1560
1561 /* Produce codes for a DIMENSION2 character whose character set is
1562    CHARSET and whose position-codes are C1 and C2.  Designation and
1563    invocation codes are also produced in advance if necessary.  */
1564
     /* The macro reads `coding' and writes through `dst', both taken from
        the scope in which it is expanded.  Its body is a `do ... while (1)'
        loop which is left by `break' as soon as the character has been
        written.  The cases are tried in this order:
          - a single-shift is pending: write C1 and C2 with the 8th bit
            cleared (CODING_FLAG_ISO_SEVEN_BITS set) or with the 8th bit
            set, and clear the pending state;
          - CHARSET is the charset of graphic plane 0: write C1 & 0x7F and
            C2 & 0x7F;
          - CHARSET is the charset of graphic plane 1: write C1 | 0x80 and
            C2 | 0x80;
          - CODING_FLAG_ISO_SAFE is set and CHARSET has no entry in
            coding->safe_charsets: write the substitution byte
            CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice when
            CHARSET_WIDTH (CHARSET) is 2.
        If none applies, encode_invocation_designation is called to emit
        the needed designation/invocation and the loop runs again.  */
1565 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1566   do {                                                                  \
1567     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1568       {                                                                 \
1569         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1570           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1571         else                                                            \
1572           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1573         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1574         break;                                                          \
1575       }                                                                 \
1576     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1577       {                                                                 \
1578         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1579         break;                                                          \
1580       }                                                                 \
1581     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1582       {                                                                 \
1583         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1584         break;                                                          \
1585       }                                                                 \
1586     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1587              && !coding->safe_charsets[charset])                        \
1588       {                                                                 \
1589         /* We should not encode this character, instead produce one or  \
1590            two `?'s.  */                                                \
1591         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1592         if (CHARSET_WIDTH (charset) == 2)                               \
1593           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1594         break;                                                          \
1595       }                                                                 \
1596     else                                                                \
1597       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1598          must invoke it, or, at first, designate it to some graphic     \
1599          register.  Then repeat the loop to actually produce the        \
1600          character.  */                                                 \
1601       dst = encode_invocation_designation (charset, coding, dst);       \
1602   } while (1)
1603
/* Encode one character of charset CHARSET whose position-codes are C1
   and C2 (C2 is a dummy for a one-dimensional charset).  C1 and C2
   must be lvalues: when a translation applies they are overwritten
   with the position-codes of the translated character.

   The macro uses `translation_table', `coding' and (through the
   ENCODE_ISO_CHARACTER_DIMENSION* macros) `dst' of the scope in which
   it is expanded.

   If `translation_table' is non-nil and translate_char returns a
   non-negative character, that character is split into its charset
   and position-codes and encoded instead of the original one.  Then,
   depending on the dimension of the charset actually being encoded:
     - dimension 1: ASCII is replaced by `charset_latin_jisx0201' when
       CODING_FLAG_ISO_USE_ROMAN is set;
     - otherwise: `charset_jisx0208' is replaced by
       `charset_jisx0208_1978' when CODING_FLAG_ISO_USE_OLDJIS is set.
   Both substitutions look at the charset *after* translation.
   Testing the untranslated CHARSET here would relabel a character
   that the table moved to some other charset, and its position-codes
   would then be emitted under the wrong designation.

   Finally coding->consumed_char is incremented unless a composition
   is in progress.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                   \
  do {                                                          \
    int c_alt, charset_alt;                                     \
    if (!NILP (translation_table)                               \
        && ((c_alt = translate_char (translation_table, -1,     \
                                     charset, c1, c2))          \
            >= 0))                                              \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
    else                                                        \
      charset_alt = charset;                                    \
    if (CHARSET_DIMENSION (charset_alt) == 1)                   \
      {                                                         \
        if (charset_alt == CHARSET_ASCII                        \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)       \
          charset_alt = charset_latin_jisx0201;                 \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);      \
      }                                                         \
    else                                                        \
      {                                                         \
        if (charset_alt == charset_jisx0208                     \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)      \
          charset_alt = charset_jisx0208_1978;                  \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);  \
      }                                                         \
    if (! COMPOSING_P (coding->composing))                      \
      coding->consumed_char++;                                  \
  } while (0)
1631
1632 /* Produce designation and invocation codes at a place pointed by DST
1633    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1634    Return new DST.  */
1635
     /* Two steps are taken, each only when needed:
        1. If CHARSET is not the current designation of any of the four
           graphic registers, it is designated (ENCODE_DESIGNATION) to the
           register it requests in CODING, or to register 0 when it
           requests none.
        2. If the register holding CHARSET is invoked to neither graphic
           plane 0 nor graphic plane 1, a shift is produced for it:
           shift-in for register 0, shift-out for register 1, and for
           registers 2 and 3 a single-shift when
           CODING_FLAG_ISO_SINGLE_SHIFT is set, a locking-shift otherwise.
        The macros used here write through the local `dst', so the value
        returned points just past the last byte produced (or equals the
        DST argument when nothing had to be written).  */
1636 unsigned char *
1637 encode_invocation_designation (charset, coding, dst)
1638      int charset;
1639      struct coding_system *coding;
1640      unsigned char *dst;
1641 {
1642   int reg;                      /* graphic register number */
1643
1644   /* At first, check designations.  */
1645   for (reg = 0; reg < 4; reg++)
1646     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1647       break;
1648
       /* REG is 4 here exactly when the loop above found no register.  */
1649   if (reg >= 4)
1650     {
1651       /* CHARSET is not yet designated to any graphic registers.  */
1652       /* At first check the requested designation.  */
1653       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1654       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1655         /* Since CHARSET requests no special designation, designate it
1656            to graphic register 0.  */
1657         reg = 0;
1658
1659       ENCODE_DESIGNATION (charset, reg, coding);
1660     }
1661
1662   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1663       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1664     {
1665       /* Since the graphic register REG is not invoked to any graphic
1666          planes, invoke it to graphic plane 0.  */
1667       switch (reg)
1668         {
1669         case 0:                 /* graphic register 0 */
1670           ENCODE_SHIFT_IN;
1671           break;
1672
1673         case 1:                 /* graphic register 1 */
1674           ENCODE_SHIFT_OUT;
1675           break;
1676
1677         case 2:                 /* graphic register 2 */
                 /* A single-shift only marks the shift as pending in CODING;
                    a locking-shift records register 2 as invoked.  */
1678           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1679             ENCODE_SINGLE_SHIFT_2;
1680           else
1681             ENCODE_LOCKING_SHIFT_2;
1682           break;
1683
1684         case 3:                 /* graphic register 3 */
1685           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1686             ENCODE_SINGLE_SHIFT_3;
1687           else
1688             ENCODE_LOCKING_SHIFT_3;
1689           break;
1690         }
1691     }
1692   return dst;
1693 }
1694
1695 /* The following three macros produce codes for indicating composition.
        Each one writes a two-byte escape sequence through the variable
        `dst' of the expanding scope: ESC `0' starts a composition without
        rule, ESC `2' starts a composition with rule, and ESC `1' ends a
        composition.  Each expands to a bare comma expression, not to a
        `do ... while (0)' statement.  */
1696 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1697 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1698 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1699
/* The following three macros produce codes for indicating direction
   of text.  All of them use the variables `coding' and `dst' of the
   scope in which they are expanded and advance `dst' past the bytes
   written.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the two bytes ESC `[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in coding->flags, otherwise the
   single byte ISO_CODE_CSI.  The flag is tested as a bit with `&':
   coding->flags holds several flags at once, so comparing the whole
   word for equality would select the 7-bit form only when no other
   flag happens to be set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write that introducer
   followed by `2' `]' and `0' `]' respectively.  They are full
   `do ... while (0)' statements: the introducer macro is itself a
   statement and cannot be an operand of the comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2', *dst++ = ']';                         \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0', *dst++ = ']';                         \
  } while (0)
1715
1716 /* Produce codes for designation and invocation to reset the graphic
1717    planes and registers to initial state.  */
     /* Uses the variables `coding' and `dst' of the expanding scope.
        First, if graphic plane 0 does not currently have register 0
        invoked, a shift-in is produced.  Then, for each of the four
        graphic registers whose initial designation in CODING is
        non-negative and differs from the current designation, a
        designation sequence for the initial charset is produced.
        The invocation of graphic plane 1 is not touched here.  */
1718 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1719   do {                                                                      \
1720     int reg;                                                                \
1721     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1722       ENCODE_SHIFT_IN;                                                      \
1723     for (reg = 0; reg < 4; reg++)                                           \
1724       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1725           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1726               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1727         ENCODE_DESIGNATION                                                  \
1728           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1729   } while (0)
1730
1731 /* Produce designation sequences of charsets in the line started from
1732    SRC to a place pointed by *DSTP, and update DSTP.
1733
1734    If the current block ends before any end-of-line, we may fail to
1735    find all the necessary designations.  */
1736
     /* TABLE is either nil or a translation table; when it is non-nil each
        character is passed to translate_char and, if that returns a
        non-negative character, the charset of the translated character is
        used instead.

        The text from SRC is scanned up to SRC_END, the first '\n', or
        until four registers have been claimed, whichever comes first.
        For every character whose charset requests a graphic register in
        CODING, the first such charset seen for that register is
        remembered.  Afterwards a designation sequence is produced for
        each remembered register whose current designation differs.
        *DSTP is stored back only when at least one register was claimed.  */
1737 void
1738 encode_designation_at_bol (coding, table, src, src_end, dstp)
1739      struct coding_system *coding;
1740      Lisp_Object table;
1741      unsigned char *src, *src_end, **dstp;
1742 {
       /* NOTE(review): `c' is declared here but never used below.  */
1743   int charset, c, found = 0, reg;
1744   /* Table of charsets to be designated to each graphic register.  */
1745   int r[4];
1746   unsigned char *dst = *dstp;
1747
       /* -1 marks a register for which no charset has been seen yet.  */
1748   for (reg = 0; reg < 4; reg++)
1749     r[reg] = -1;
1750
1751   while (src < src_end && *src != '\n' && found < 4)
1752     {
           /* Length in bytes of the character starting at SRC, as told by
              its first byte.  */
1753       int bytes = BYTES_BY_CHAR_HEAD (*src);
1754
1755       if (NILP (table))
1756         charset = CHARSET_AT (src);
1757       else
1758         {
1759           int c_alt;
1760           unsigned char c1, c2;
1761
1762           SPLIT_STRING(src, bytes, charset, c1, c2);
1763           if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
1764             charset = CHAR_CHARSET (c_alt);
1765         }
1766
           /* Only the first charset requesting a given register is kept.  */
1767       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1768       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1769         {
1770           found++;
1771           r[reg] = charset;
1772         }
1773
1774       src += bytes;
1775     }
1776
1777   if (found)
1778     {
           /* ENCODE_DESIGNATION writes through the local `dst'.  */
1779       for (reg = 0; reg < 4; reg++)
1780         if (r[reg] >= 0
1781             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1782           ENCODE_DESIGNATION (r[reg], reg, coding);
1783       *dstp = dst;
1784     }
1785 }
1786
1787 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1788
     /* Encode SRC_BYTES bytes of text at SOURCE into an ISO2022 variant
        described by CODING, storing the result at DESTINATION.

        DST_BYTES is the size of DESTINATION.  When it is zero the loop
        below instead requires DST to stay at least 19 bytes behind SRC
        (presumably source and destination share one buffer in that case
        -- confirm against the callers).

        On return, coding->consumed is the number of source bytes used,
        coding->consumed_char counts the characters encoded,
        coding->produced and coding->produced_char are both set to the
        number of bytes stored, and coding->fake_multibyte has been reset
        to 0 on entry (the single-shift macros may set it to 1).

        The value is CODING_FINISH_NORMAL, or
        CODING_FINISH_INSUFFICIENT_SRC when the source ends inside a
        multi-byte sequence, or CODING_FINISH_INSUFFICIENT_DST when the
        loop stopped with source left over for any other reason.  */
1789 int
1790 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1791      struct coding_system *coding;
1792      unsigned char *source, *destination;
1793      int src_bytes, dst_bytes;
1794 {
1795   unsigned char *src = source;
1796   unsigned char *src_end = source + src_bytes;
1797   unsigned char *dst = destination;
1798   unsigned char *dst_end = destination + dst_bytes;
1799   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1800      from DST_END to assure overflow checking is necessary only at the
1801      head of loop.  */
1802   unsigned char *adjusted_dst_end = dst_end - 19;
1803   Lisp_Object translation_table
1804       = coding->translation_table_for_encode;
1805   int result = CODING_FINISH_NORMAL;
1806
       /* Fall back to the standard table when character translation is
          enabled and CODING has no table of its own.  */
1807   if (!NILP (Venable_character_translation) && NILP (translation_table))
1808     translation_table = Vstandard_translation_table_for_encode;
1809
1810   coding->consumed_char = 0;
1811   coding->fake_multibyte = 0;
1812   while (src < src_end && (dst_bytes
1813                            ? (dst < adjusted_dst_end)
1814                            : (dst < src - 19)))
1815     {
1816       /* SRC_BASE remembers the start position in source in each loop.
1817          The loop will be exited when there's not enough source text
1818          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1819          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1820          reset to SRC_BASE before exiting.  */
1821       unsigned char *src_base = src;
1822       int charset, c1, c2, c3, c4;
1823
1824       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1825           && CODING_SPEC_ISO_BOL (coding))
1826         {
1827           /* We have to produce designation sequences if any now.  */
1828           encode_designation_at_bol (coding, translation_table,
1829                                      src, src_end, &dst);
1830           CODING_SPEC_ISO_BOL (coding) = 0;
1831         }
1832
1833       c1 = *src++;
1834       /* If we are seeing a component of a composite character, we are
1835          seeing a leading-code encoded irregularly for composition, or
1836          a composition rule if composing with rule.  We must set C1 to
1837          a normal leading-code or an ASCII code.  If we are not seeing
1838          a composite character, we must reset composition,
1839          designation, and invocation states.  */
1840       if (COMPOSING_P (coding->composing))
1841         {
1842           if (c1 < 0xA0)
1843             {
1844               /* We are not in a composite character any longer.  */
1845               coding->composing = COMPOSING_NO;
1846               ENCODE_RESET_PLANE_AND_REGISTER;
1847               ENCODE_COMPOSITION_END;
1848             }
1849           else
1850             {
                   /* When composing with rule, the state alternates between
                      RULE_HEAD (a component comes next) and RULE_RULE (a
                      rule byte comes next).  A rule byte is written with
                      its 8th bit cleared and nothing else is done for it.  */
1851               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1852                 {
1853                   *dst++ = c1 & 0x7F;
1854                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1855                   continue;
1856                 }
1857               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1858                 coding->composing = COMPOSING_WITH_RULE_RULE;
1859               if (c1 == 0xA0)
1860                 {
1861                   /* This is an ASCII component.  */
1862                   ONE_MORE_BYTE (c1);
1863                   c1 &= 0x7F;
1864                 }
1865               else
1866                 /* This is a leading-code of non ASCII component.  */
1867                 c1 -= 0x20;
1868             }
1869         }
1870
1871       /* Now encode one character.  C1 is a control character, an
1872          ASCII character, or a leading-code of multi-byte character.  */
1873       switch (emacs_code_class[c1])
1874         {
1875         case EMACS_ascii_code:
1876           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1877           break;
1878
1879         case EMACS_control_code:
1880           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1881             ENCODE_RESET_PLANE_AND_REGISTER;
1882           *dst++ = c1;
1883           coding->consumed_char++;
1884           break;
1885
1886         case EMACS_carriage_return_code:
1887           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1888             {
1889               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1890                 ENCODE_RESET_PLANE_AND_REGISTER;
1891               *dst++ = c1;
1892               coding->consumed_char++;
1893               break;
1894             }
1895           /* fall down to treat '\r' as '\n' ...  */
1896
1897         case EMACS_linefeed_code:
1898           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1899             ENCODE_RESET_PLANE_AND_REGISTER;
               /* With INIT_AT_BOL only the recorded state is reset to the
                  initial designations; no bytes are written for it.  */
1900           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1901             bcopy (coding->spec.iso2022.initial_designation,
1902                    coding->spec.iso2022.current_designation,
1903                    sizeof coding->spec.iso2022.initial_designation);
               /* Write the end-of-line in the form CODING asks for: LF,
                  CR LF, or (any other eol_type) CR alone.  */
1904           if (coding->eol_type == CODING_EOL_LF
1905               || coding->eol_type == CODING_EOL_UNDECIDED)
1906             *dst++ = ISO_CODE_LF;
1907           else if (coding->eol_type == CODING_EOL_CRLF)
1908             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1909           else
1910             *dst++ = ISO_CODE_CR;
1911           CODING_SPEC_ISO_BOL (coding) = 1;
1912           coding->consumed_char++;
1913           break;
1914
             /* In the multi-byte cases below, a trailing byte under 0xA0
                makes the sequence invalid: C1 alone is copied through and
                SRC is moved back so the trailing bytes are read again.  */
1915         case EMACS_leading_code_2:
1916           ONE_MORE_BYTE (c2);
1917           if (c2 < 0xA0)
1918             {
1919               /* invalid sequence */
1920               *dst++ = c1;
1921               src--;
1922               coding->consumed_char++;
1923             }
1924           else
1925             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1926           break;
1927
1928         case EMACS_leading_code_3:
1929           TWO_MORE_BYTES (c2, c3);
1930           if (c2 < 0xA0 || c3 < 0xA0)
1931             {
1932               /* invalid sequence */
1933               *dst++ = c1;
1934               src -= 2;
1935               coding->consumed_char++;
1936             }
               /* Below LEADING_CODE_PRIVATE_11, C1 itself is the charset;
                  otherwise C2 is passed as the charset and C3 as the
                  position-code.  */
1937           else if (c1 < LEADING_CODE_PRIVATE_11)
1938             ENCODE_ISO_CHARACTER (c1, c2, c3);
1939           else
1940             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1941           break;
1942
1943         case EMACS_leading_code_4:
1944           THREE_MORE_BYTES (c2, c3, c4);
1945           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1946             {
1947               /* invalid sequence */
1948               *dst++ = c1;
1949               src -= 3;
1950               coding->consumed_char++;
1951             }
1952           else
1953             ENCODE_ISO_CHARACTER (c2, c3, c4);
1954           break;
1955
1956         case EMACS_leading_code_composition:
1957           ONE_MORE_BYTE (c2);
1958           if (c2 < 0xA0)
1959             {
1960               /* invalid sequence */
1961               *dst++ = c1;
1962               src--;
1963               coding->consumed_char++;
1964             }
               /* A second byte of 0xFF starts a composition with rule.  */
1965           else if (c2 == 0xFF)
1966             {
1967               ENCODE_RESET_PLANE_AND_REGISTER;
1968               coding->composing = COMPOSING_WITH_RULE_HEAD;
1969               ENCODE_COMPOSITION_WITH_RULE_START;
1970               coding->consumed_char++;
1971             }
1972           else
1973             {
1974               ENCODE_RESET_PLANE_AND_REGISTER;
1975               /* Rewind one byte because it is a character code of
1976                  composition elements.  */
1977               src--;
1978               coding->composing = COMPOSING_NO_RULE_HEAD;
1979               ENCODE_COMPOSITION_NO_RULE_START;
1980               coding->consumed_char++;
1981             }
1982           break;
1983
1984         case EMACS_invalid_code:
1985           *dst++ = c1;
1986           coding->consumed_char++;
1987           break;
1988         }
1989       continue;
           /* Reached only by a jump from outside the switch (see the
              SRC_BASE comment above): undo the partial read and stop.  */
1990     label_end_of_loop:
1991       result = CODING_FINISH_INSUFFICIENT_SRC;
1992       src = src_base;
1993       break;
1994     }
1995
       /* Source is left although nothing ran short of source bytes: the
          loop condition stopped us, so report a lack of destination.  */
1996   if (src < src_end && result == CODING_FINISH_NORMAL)
1997     result = CODING_FINISH_INSUFFICIENT_DST;
1998
1999   /* If this is the last block of the text to be encoded, we must
2000      reset graphic planes and registers to the initial state, and
2001      flush out the carryover if any.  */
2002   if (coding->mode & CODING_MODE_LAST_BLOCK)
2003     {
           /* NOTE(review): the reset and composition-end bytes are written
              without comparing DST against DST_END here.  */
2004       ENCODE_RESET_PLANE_AND_REGISTER;
2005       if (COMPOSING_P (coding->composing))
2006         ENCODE_COMPOSITION_END;
2007       if (result == CODING_FINISH_INSUFFICIENT_SRC)
2008         {
               /* Copy the incomplete trailing bytes through unchanged, as
                  far as the destination has room.  */
2009           while (src < src_end && dst < dst_end)
2010             *dst++ = *src++;
2011         }
2012     }
2013   coding->consumed = src - source;
2014   coding->produced = coding->produced_char = dst - destination;
2015   return result;
2016 }
2017
2018 \f
2019 /*** 4. SJIS and BIG5 handlers ***/
2020
2021 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2022    quite widely.  So, for the moment, Emacs supports them in the bare
2023    C code.  But, in the future, they may be supported only by CCL.  */
2024
2025 /* SJIS is a coding system encoding three character sets: ASCII, right
2026    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2027    as is.  A character of charset katakana-jisx0201 is encoded by
2028    "position-code + 0x80".  A character of charset japanese-jisx0208
2029    is encoded in two bytes, but its two position-codes are divided and
2030    shifted so that they fit in the ranges below.
2031
2032    --- CODE RANGE of SJIS ---
2033    (character set)      (range)
2034    ASCII                0x00 .. 0x7F
2035    KATAKANA-JISX0201    0xA0 .. 0xDF
2036    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xEF
2037             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2038    -------------------------------
2039
2040 */
2041
2042 /* BIG5 is a coding system encoding two character sets: ASCII and
2043    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2044    character set and is encoded in two bytes.
2045
2046    --- CODE RANGE of BIG5 ---
2047    (character set)      (range)
2048    ASCII                0x00 .. 0x7F
2049    Big5 (1st byte)      0xA1 .. 0xFE
2050         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2051    --------------------------
2052
2053    Since the number of characters in Big5 is larger than maximum
2054    characters in Emacs' charset (96x96), it can't be handled as one
2055    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2056    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2057    contains frequently used characters and the latter contains less
2058    frequently used characters.  */
2059
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   The stored-to arguments must be lvalues.

   Both macros go through a linear index: Big5 has BIG5_SAME_ROW
   characters per first byte (second bytes 0x40..0x7E followed by
   0xA1..0xFE), while an Emacs charset row has 0xFF - 0xA1 (94)
   characters.  First bytes below 0xC9 belong to `charset_big5_1';
   the others belong to `charset_big5_2', whose index restarts at 0.

   Every macro parameter is parenthesized in the expansion so that an
   argument such as `c | 0x80' or `c & 0x7F' is taken as one operand.
   B2 (in DECODE_BIG5) is still evaluated more than once, so arguments
   must not have side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2092
/* Produce one decoded character of CHARSET whose position codes are
   C1 and C2.  `translation_table' is taken from the expanding
   function.  When it is non-nil the character is first passed to
   translate_char; a non-negative result replaces the charset and the
   position codes via SPLIT_CHAR (so C1 and C2 must be lvalues).  The
   character is then handed to DECODE_CHARACTER_ASCII when the
   resulting charset is CHARSET_ASCII or negative, and otherwise to
   DECODE_CHARACTER_DIMENSION1 or DECODE_CHARACTER_DIMENSION2
   according to CHARSET_DIMENSION.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt = -1;                                                     \
                                                                        \
    if (!NILP (translation_table))                                      \
      c_alt = translate_char (translation_table,                        \
                              -1, (charset), c1, c2);                   \
    if (c_alt >= 0)                                                     \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
                                                                        \
    if (charset_alt != CHARSET_ASCII && charset_alt >= 0)               \
      {                                                                 \
        if (CHARSET_DIMENSION (charset_alt) == 1)                       \
          DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                \
        else                                                            \
          DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);            \
      }                                                                 \
    else                                                                \
      DECODE_CHARACTER_ASCII (c1);                                      \
  } while (0)
2107
/* Encode one character of CHARSET with position codes C1 and C2 and
   store the result through `dst'.  The macro uses `translation_table',
   `sjis_p', `dst' and `coding' of the expanding function, and it may
   modify C1 and C2 (which must therefore be lvalues).

   When `translation_table' is non-nil the character is first passed
   to translate_char; a non-negative result replaces the charset and
   position codes via SPLIT_CHAR.  Then:
     - an ASCII character is stored as the single byte C1;
     - a one-dimensional character is stored as C1 alone for JISX0201
       Katakana under SJIS, otherwise as the charset byte followed by
       C1;
     - a two-dimensional character is stored as two SJIS bytes
       (JISX0208 under SJIS), as two BIG5 bytes (the Big5 charsets
       when not SJIS), or otherwise as the charset byte followed by
       C1 and C2.
   Hence up to three bytes are stored per character.
   coding->consumed_char is incremented once, and
   coding->fake_multibyte is set on the paths that store a charset
   byte or an SJIS pair.

   The expansion is a single statement WITHOUT a trailing semicolon,
   so the macro can be used like a function call, including as the
   body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)             \
  do {                                                          \
    int c_alt, charset_alt;                                     \
    if (!NILP (translation_table)                               \
        && ((c_alt = translate_char (translation_table, -1,     \
                                     charset, c1, c2))          \
            >= 0))                                              \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
    else                                                        \
      charset_alt = charset;                                    \
    if (charset_alt == charset_ascii)                           \
      *dst++ = c1;                                              \
    else if (CHARSET_DIMENSION (charset_alt) == 1)              \
      {                                                         \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1;                                          \
        else                                                    \
          {                                                     \
            *dst++ = charset_alt, *dst++ = c1;                  \
            coding->fake_multibyte = 1;                         \
          }                                                     \
      }                                                         \
    else                                                        \
      {                                                         \
        /* Only the low 7 bits carry the position codes.  */    \
        c1 &= 0x7F, c2 &= 0x7F;                                 \
        if (sjis_p && charset_alt == charset_jisx0208)          \
          {                                                     \
            unsigned char s1, s2;                               \
                                                                \
            ENCODE_SJIS (c1, c2, s1, s2);                       \
            *dst++ = s1, *dst++ = s2;                           \
            coding->fake_multibyte = 1;                         \
          }                                                     \
        else if (!sjis_p                                        \
                 && (charset_alt == charset_big5_1              \
                     || charset_alt == charset_big5_2))         \
          {                                                     \
            unsigned char b1, b2;                               \
                                                                \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);          \
            *dst++ = b1, *dst++ = b2;                           \
          }                                                     \
        else                                                    \
          {                                                     \
            /* Not encodable here: pass it on unencoded.  */    \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;     \
            coding->fake_multibyte = 1;                         \
          }                                                     \
      }                                                         \
    coding->consumed_char++;                                    \
  } while (0)
2158
2159 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2160    Check if a text is encoded in SJIS.  If it is, return
2161    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2162
2163 int
2164 detect_coding_sjis (src, src_end)
2165      unsigned char *src, *src_end;
2166 {
2167   unsigned char c;
2168
2169   while (src < src_end)
2170     {
2171       c = *src++;
2172       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2173         {
2174           if (src < src_end && *src++ < 0x40)
2175             return 0;
2176         }
2177     }
2178   return CODING_CATEGORY_MASK_SJIS;
2179 }
2180
2181 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2182    Check if a text is encoded in BIG5.  If it is, return
2183    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2184
2185 int
2186 detect_coding_big5 (src, src_end)
2187      unsigned char *src, *src_end;
2188 {
2189   unsigned char c;
2190
2191   while (src < src_end)
2192     {
2193       c = *src++;
2194       if (c >= 0xA1)
2195         {
2196           if (src >= src_end)
2197             break;
2198           c = *src++;
2199           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2200             return 0;
2201         }
2202     }
2203   return CODING_CATEGORY_MASK_BIG5;
2204 }
2205
2206 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2207    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2208
2209 int
2210 decode_coding_sjis_big5 (coding, source, destination,
2211                          src_bytes, dst_bytes, sjis_p)
2212      struct coding_system *coding;
2213      unsigned char *source, *destination;
2214      int src_bytes, dst_bytes;
2215      int sjis_p;
2216 {
2217   unsigned char *src = source;
2218   unsigned char *src_end = source + src_bytes;
2219   unsigned char *dst = destination;
2220   unsigned char *dst_end = destination + dst_bytes;
2221   /* Since the maximum bytes produced by each loop is 4, we subtract 3
2222      from DST_END to assure overflow checking is necessary only at the
2223      head of loop.  */
2224   unsigned char *adjusted_dst_end = dst_end - 3;
2225   Lisp_Object translation_table
2226       = coding->translation_table_for_decode;
2227   int result = CODING_FINISH_NORMAL;
2228
2229   if (!NILP (Venable_character_translation) && NILP (translation_table))
2230     translation_table = Vstandard_translation_table_for_decode;
2231
2232   coding->produced_char = 0;
2233   coding->fake_multibyte = 0;
2234   while (src < src_end && (dst_bytes
2235                            ? (dst < adjusted_dst_end)
2236                            : (dst < src - 3)))
2237     {
2238       /* SRC_BASE remembers the start position in source in each loop.
2239          The loop will be exited when there's not enough source text
2240          to analyze two-byte character (within macro ONE_MORE_BYTE).
2241          In that case, SRC is reset to SRC_BASE before exiting.  */
2242       unsigned char *src_base = src;
2243       unsigned char c1 = *src++, c2, c3, c4;
2244
2245       if (c1 < 0x20)
2246         {
2247           if (c1 == '\r')
2248             {
2249               if (coding->eol_type == CODING_EOL_CRLF)
2250                 {
2251                   ONE_MORE_BYTE (c2);
2252                   if (c2 == '\n')
2253                     *dst++ = c2;
2254                   else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2255                     {
2256                       result = CODING_FINISH_INCONSISTENT_EOL;
2257                       goto label_end_of_loop_2;
2258                     }
2259                   else
2260                     /* To process C2 again, SRC is subtracted by 1.  */
2261                     *dst++ = c1, src--;
2262                 }
2263               else if (coding->eol_type == CODING_EOL_CR)
2264                 *dst++ = '\n';
2265               else
2266                 *dst++ = c1;
2267             }
2268           else if (c1 == '\n'
2269                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2270                    && (coding->eol_type == CODING_EOL_CR
2271                        || coding->eol_type == CODING_EOL_CRLF))
2272             {
2273               result = CODING_FINISH_INCONSISTENT_EOL;
2274               goto label_end_of_loop_2;
2275             }
2276           else
2277             *dst++ = c1;
2278           coding->produced_char++;
2279         }
2280       else if (c1 < 0x80)
2281         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2282       else
2283         {
2284           if (sjis_p)
2285             {
2286               if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2287                 {
2288                   /* SJIS -> JISX0208 */
2289                   ONE_MORE_BYTE (c2);
2290                   if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2291                     {
2292                       DECODE_SJIS (c1, c2, c3, c4);
2293                       DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2294                     }
2295                   else
2296                     goto label_invalid_code_2;
2297                 }
2298               else if (c1 < 0xE0)
2299                 /* SJIS -> JISX0201-Kana */
2300                 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2301                                             /* dummy */ c2);
2302               else
2303                 goto label_invalid_code_1;
2304             }
2305           else
2306             {
2307               /* BIG5 -> Big5 */
2308               if (c1 >= 0xA1 && c1 <= 0xFE)
2309                 {
2310                   ONE_MORE_BYTE (c2);
2311                   if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2312                     {
2313                       int charset;
2314
2315                       DECODE_BIG5 (c1, c2, charset, c3, c4);
2316                       DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2317                     }
2318                   else
2319                     goto label_invalid_code_2;
2320                 }
2321               else
2322                 goto label_invalid_code_1;
2323             }
2324         }
2325       continue;
2326
2327     label_invalid_code_1:
2328       *dst++ = c1;
2329       coding->produced_char++;
2330       coding->fake_multibyte = 1;
2331       continue;
2332
2333     label_invalid_code_2:
2334       *dst++ = c1; *dst++= c2;
2335       coding->produced_char += 2;
2336       coding->fake_multibyte = 1;
2337       continue;
2338
2339     label_end_of_loop:
2340       result = CODING_FINISH_INSUFFICIENT_SRC;
2341     label_end_of_loop_2:
2342       src = src_base;
2343       break;
2344     }
2345
2346   if (src < src_end)
2347     {
2348       if (result == CODING_FINISH_NORMAL)
2349         result = CODING_FINISH_INSUFFICIENT_DST;
2350       else if (result != CODING_FINISH_INCONSISTENT_EOL
2351                && coding->mode & CODING_MODE_LAST_BLOCK)
2352         {
2353           src_bytes = src_end - src;
2354           if (dst_bytes && (dst_end - dst < src_bytes))
2355             src_bytes = dst_end - dst;
2356           bcopy (dst, src, src_bytes);
2357           src += src_bytes;
2358           dst += src_bytes;
2359           coding->fake_multibyte = 1;
2360         }
2361     }
2362
2363   coding->consumed = coding->consumed_char = src - source;
2364   coding->produced = dst - destination;
2365   return result;
2366 }
2367
2368 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2369    This function can encode `charset_ascii', `charset_katakana_jisx0201',
2370    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
2371    sure that all these charsets are registered as official charset
2372    (i.e. do not have extended leading-codes).  Characters of other
2373    charsets are produced without any encoding.  If SJIS_P is 1, encode
2374    SJIS text, else encode BIG5 text.  */
2375
2376 int
2377 encode_coding_sjis_big5 (coding, source, destination,
2378                          src_bytes, dst_bytes, sjis_p)
2379      struct coding_system *coding;
2380      unsigned char *source, *destination;
2381      int src_bytes, dst_bytes;
2382      int sjis_p;
2383 {
2384   unsigned char *src = source;
2385   unsigned char *src_end = source + src_bytes;
2386   unsigned char *dst = destination;
2387   unsigned char *dst_end = destination + dst_bytes;
2388   /* Since the maximum bytes produced by each loop is 2, we subtract 1
2389      from DST_END to assure overflow checking is necessary only at the
2390      head of loop.  */
2391   unsigned char *adjusted_dst_end = dst_end - 1;
2392   Lisp_Object translation_table
2393       = coding->translation_table_for_encode;
2394   int result = CODING_FINISH_NORMAL;
2395
2396   if (!NILP (Venable_character_translation) && NILP (translation_table))
2397     translation_table = Vstandard_translation_table_for_encode;
2398
2399   coding->consumed_char = 0;
2400   coding->fake_multibyte = 0;
2401   while (src < src_end && (dst_bytes
2402                            ? (dst < adjusted_dst_end)
2403                            : (dst < src - 1)))
2404     {
2405       /* SRC_BASE remembers the start position in source in each loop.
2406          The loop will be exited when there's not enough source text
2407          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2408          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2409          before exiting.  */
2410       unsigned char *src_base = src;
2411       unsigned char c1 = *src++, c2, c3, c4;
2412
2413       if (coding->composing)
2414         {
2415           if (c1 == 0xA0)
2416             {
2417               ONE_MORE_BYTE (c1);
2418               c1 &= 0x7F;
2419             }
2420           else if (c1 >= 0xA0)
2421             c1 -= 0x20;
2422           else
2423             coding->composing = 0;
2424         }
2425
2426       switch (emacs_code_class[c1])
2427         {
2428         case EMACS_ascii_code:
2429           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2430           break;
2431
2432         case EMACS_control_code:
2433           *dst++ = c1;
2434           coding->consumed_char++;
2435           break;
2436
2437         case EMACS_carriage_return_code:
2438           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2439             {
2440               *dst++ = c1;
2441               coding->consumed_char++;
2442               break;
2443             }
2444           /* fall down to treat '\r' as '\n' ...  */
2445
2446         case EMACS_linefeed_code:
2447           if (coding->eol_type == CODING_EOL_LF
2448               || coding->eol_type == CODING_EOL_UNDECIDED)
2449             *dst++ = '\n';
2450           else if (coding->eol_type == CODING_EOL_CRLF)
2451             *dst++ = '\r', *dst++ = '\n';
2452           else
2453             *dst++ = '\r';
2454           coding->consumed_char++;
2455           break;
2456
2457         case EMACS_leading_code_2:
2458           ONE_MORE_BYTE (c2);
2459           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2460           break;
2461
2462         case EMACS_leading_code_3:
2463           TWO_MORE_BYTES (c2, c3);
2464           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2465           break;
2466
2467         case EMACS_leading_code_4:
2468           THREE_MORE_BYTES (c2, c3, c4);
2469           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2470           break;
2471
2472         case EMACS_leading_code_composition:
2473           coding->composing = 1;
2474           break;
2475
2476         default:                /* i.e. case EMACS_invalid_code: */
2477           *dst++ = c1;
2478           coding->consumed_char++;
2479         }
2480       continue;
2481
2482     label_end_of_loop:
2483       result = CODING_FINISH_INSUFFICIENT_SRC;
2484       src = src_base;
2485       break;
2486     }
2487
2488   if (result == CODING_FINISH_NORMAL
2489       && src < src_end)
2490     result = CODING_FINISH_INSUFFICIENT_DST;
2491   coding->consumed = src - source;
2492   coding->produced = coding->produced_char = dst - destination;
2493   return result;
2494 }
2495
2496 \f
2497 /*** 5. CCL handlers ***/
2498
2499 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2500    Check if a text is encoded in a coding system of which
2501    encoder/decoder are written in CCL program.  If it is, return
2502    CODING_CATEGORY_MASK_CCL, else return 0.  */
2503
2504 int
2505 detect_coding_ccl (src, src_end)
2506      unsigned char *src, *src_end;
2507 {
2508   unsigned char *valid;
2509
2510   /* No coding system is assigned to coding-category-ccl.  */
2511   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2512     return 0;
2513
2514   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2515   while (src < src_end)
2516     {
2517       if (! valid[*src]) return 0;
2518       src++;
2519     }
2520   return CODING_CATEGORY_MASK_CCL;
2521 }
2522
2523 \f
2524 /*** 6. End-of-line handlers ***/
2525
2526 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2527    This function is called only when `coding->eol_type' is
2528    CODING_EOL_CRLF or CODING_EOL_CR.  */
2529
2530 int
2531 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2532      struct coding_system *coding;
2533      unsigned char *source, *destination;
2534      int src_bytes, dst_bytes;
2535 {
2536   unsigned char *src = source;
2537   unsigned char *src_end = source + src_bytes;
2538   unsigned char *dst = destination;
2539   unsigned char *dst_end = destination + dst_bytes;
2540   unsigned char c;
2541   int result = CODING_FINISH_NORMAL;
2542
2543   coding->fake_multibyte = 0;
2544
2545   if (src_bytes <= 0)
2546     return result;
2547
2548   switch (coding->eol_type)
2549     {
2550     case CODING_EOL_CRLF:
2551       {
2552         /* Since the maximum bytes produced by each loop is 2, we
2553            subtract 1 from DST_END to assure overflow checking is
2554            necessary only at the head of loop.  */
2555         unsigned char *adjusted_dst_end = dst_end - 1;
2556
2557         while (src < src_end && (dst_bytes
2558                                  ? (dst < adjusted_dst_end)
2559                                  : (dst < src - 1)))
2560           {
2561             unsigned char *src_base = src;
2562
2563             c = *src++;
2564             if (c == '\r')
2565               {
2566                 ONE_MORE_BYTE (c);
2567                 if (c == '\n')
2568                   *dst++ = c;
2569                 else
2570                   {
2571                     if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2572                       {
2573                         result = CODING_FINISH_INCONSISTENT_EOL;
2574                         goto label_end_of_loop_2;
2575                       }
2576                     src--;
2577                     *dst++ = '\r';
2578                     if (BASE_LEADING_CODE_P (c))
2579                       coding->fake_multibyte = 1;
2580                   }
2581               }
2582             else if (c == '\n'
2583                      && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2584               {
2585                 result = CODING_FINISH_INCONSISTENT_EOL;
2586                 goto label_end_of_loop_2;
2587               }
2588             else
2589               {
2590                 *dst++ = c;
2591                 if (BASE_LEADING_CODE_P (c))
2592                   coding->fake_multibyte = 1;
2593               }
2594             continue;
2595
2596           label_end_of_loop:
2597             result = CODING_FINISH_INSUFFICIENT_SRC;
2598           label_end_of_loop_2:
2599             src = src_base;
2600             break;
2601           }
2602         if (src < src_end)
2603           {
2604             if (result == CODING_FINISH_NORMAL)
2605               result = CODING_FINISH_INSUFFICIENT_DST;
2606             else if (result != CODING_FINISH_INCONSISTENT_EOL
2607                      && coding->mode & CODING_MODE_LAST_BLOCK)
2608               {
2609                 /* This is the last block of the text to be decoded.
2610                    We flush out all remaining codes.  */
2611                 src_bytes = src_end - src;
2612                 if (dst_bytes && (dst_end - dst < src_bytes))
2613                   src_bytes = dst_end - dst;
2614                 bcopy (src, dst, src_bytes);
2615                 dst += src_bytes;
2616                 src += src_bytes;
2617               }
2618           }
2619       }
2620       break;
2621
2622     case CODING_EOL_CR:
2623       if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2624         {
2625           while (src < src_end)
2626             {
2627               if ((c = *src++) == '\n')
2628                 break;
2629               if (BASE_LEADING_CODE_P (c))
2630                 coding->fake_multibyte = 1;
2631             }
2632           if (*--src == '\n')
2633             {
2634               src_bytes = src - source;
2635               result = CODING_FINISH_INCONSISTENT_EOL;
2636             }
2637         }
2638       if (dst_bytes && src_bytes > dst_bytes)
2639         {
2640           result = CODING_FINISH_INSUFFICIENT_DST;
2641           src_bytes = dst_bytes;
2642         }
2643       if (dst_bytes)
2644         bcopy (source, destination, src_bytes);
2645       else
2646         safe_bcopy (source, destination, src_bytes);
2647       src = source + src_bytes;
2648       while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2649       break;
2650
2651     default:                    /* i.e. case: CODING_EOL_LF */
2652       if (dst_bytes && src_bytes > dst_bytes)
2653         {
2654           result = CODING_FINISH_INSUFFICIENT_DST;
2655           src_bytes = dst_bytes;
2656         }
2657       if (dst_bytes)
2658         bcopy (source, destination, src_bytes);
2659       else
2660         safe_bcopy (source, destination, src_bytes);
2661       src += src_bytes;
2662       dst += src_bytes;
2663       coding->fake_multibyte = 1;
2664       break;
2665     }
2666
2667   coding->consumed = coding->consumed_char = src - source;
2668   coding->produced = coding->produced_char = dst - destination;
2669   return result;
2670 }
2671
2672 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2673    format of end-of-line according to `coding->eol_type'.  If
2674    `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2675    '\r' in source text also means end-of-line.  */
2676
2677 int
2678 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2679      struct coding_system *coding;
2680      unsigned char *source, *destination;
2681      int src_bytes, dst_bytes;
2682 {
2683   unsigned char *src = source;
2684   unsigned char *dst = destination;
2685   int result = CODING_FINISH_NORMAL;
2686
2687   coding->fake_multibyte = 0;
2688
2689   if (coding->eol_type == CODING_EOL_CRLF)
2690     {
2691       unsigned char c;
2692       unsigned char *src_end = source + src_bytes;
2693       unsigned char *dst_end = destination + dst_bytes;
2694       /* Since the maximum bytes produced by each loop is 2, we
2695          subtract 1 from DST_END to assure overflow checking is
2696          necessary only at the head of loop.  */
2697       unsigned char *adjusted_dst_end = dst_end - 1;
2698
2699       while (src < src_end && (dst_bytes
2700                                ? (dst < adjusted_dst_end)
2701                                : (dst < src - 1)))
2702         {
2703           c = *src++;
2704           if (c == '\n'
2705               || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2706             *dst++ = '\r', *dst++ = '\n';
2707           else
2708             {
2709               *dst++ = c;
2710               if (BASE_LEADING_CODE_P (c))
2711                 coding->fake_multibyte = 1;
2712             }
2713         }
2714       if (src < src_end)
2715         result = CODING_FINISH_INSUFFICIENT_DST;
2716     }
2717   else
2718     {
2719       unsigned char c;
2720
2721       if (dst_bytes && src_bytes > dst_bytes)
2722         {
2723           src_bytes = dst_bytes;
2724           result = CODING_FINISH_INSUFFICIENT_DST;
2725         }
2726       if (dst_bytes)
2727         bcopy (source, destination, src_bytes);
2728       else
2729         safe_bcopy (source, destination, src_bytes);
2730       dst_bytes = src_bytes;
2731       if (coding->eol_type == CODING_EOL_CR)
2732         {
2733           while (src_bytes--)
2734             {
2735               if ((c = *dst++) == '\n')
2736                 dst[-1] = '\r';
2737               else if (BASE_LEADING_CODE_P (c))
2738                 coding->fake_multibyte = 1;
2739             }
2740         }
2741       else
2742         {
2743           if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2744             {
2745               while (src_bytes--)
2746                 if (*dst++ == '\r') dst[-1] = '\n';
2747             }
2748           coding->fake_multibyte = 1;
2749         }
2750       src = source + dst_bytes;
2751       dst = destination + dst_bytes;
2752     }
2753
2754   coding->consumed = coding->consumed_char = src - source;
2755   coding->produced = coding->produced_char = dst - destination;
2756   return result;
2757 }
2758
2759 \f
2760 /*** 7. C library functions ***/
2761
2762 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
2763    has a property `coding-system'.  The value of this property is a
2764    vector of length 5 (called the coding-vector).  Among elements of
2765    this vector, the first (element[0]) and the fifth (element[4])
2766    carry important information for decoding/encoding.  Before
2767    decoding/encoding, this information should be set in fields of a
2768    structure of type `coding_system'.
2769
2770    A value of property `coding-system' can be a symbol of another
2771    subsidiary coding-system.  In that case, Emacs gets coding-vector
2772    from that symbol.
2773
2774    `element[0]' contains information to be set in `coding->type'.  The
2775    value and its meaning is as follows:
2776
2777    0 -- coding_type_emacs_mule
2778    1 -- coding_type_sjis
2779    2 -- coding_type_iso2022
2780    3 -- coding_type_big5
2781    4 -- coding_type_ccl encoder/decoder written in CCL
2782    nil -- coding_type_no_conversion
2783    t -- coding_type_undecided (automatic conversion on decoding,
2784                                no-conversion on encoding)
2785
2786    `element[4]' contains information to be set in `coding->flags' and
2787    `coding->spec'.  The meaning varies by `coding->type'.
2788
2789    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2790    of length 32 (of which the first 13 sub-elements are used now).
2791    Meanings of these sub-elements are:
2792
2793    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2794         If the value is an integer of valid charset, the charset is
2795         assumed to be designated to graphic register N initially.
2796
2797         If the value is negative, it is the negated value of a charset
2798         which reserves graphic register N, which means that the charset
2799         is not designated initially but should be designated to graphic
2800         register N just before encoding a character in that charset.
2801
2802         If the value is nil, graphic register N is never used on
2803         encoding.
2804
2805    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2806         Each value takes t or nil.  See the section ISO2022 of
2807         `coding.h' for more information.
2808
2809    If `coding->type' is `coding_type_big5', element[4] is t to denote
2810    BIG5-ETen or nil to denote BIG5-HKU.
2811
2812    If `coding->type' takes any other value, element[4] is ignored.
2813
2814    Emacs Lisp's coding system also carries information about format of
2815    end-of-line in a value of property `eol-type'.  If the value is
2816    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2817    means CODING_EOL_CR.  If it is not integer, it should be a vector
2818    of subsidiary coding systems of which property `eol-type' has one
2819    of above values.
2820
2821 */
2822
2823 /* Extract information for decoding/encoding from CODING_SYSTEM and
2824    set it in CODING.  If CODING_SYSTEM is invalid, CODING is set up
2825    so that no conversion is necessary and -1 is returned; otherwise
2826    0 is returned.  */
2827
2828 int
2829 setup_coding_system (coding_system, coding)
2830      Lisp_Object coding_system;
2831      struct coding_system *coding;
2832 {
2833   Lisp_Object coding_spec, coding_type, eol_type, plist;
2834   Lisp_Object val;
2835   int i;
2836
2837   /* Initialize some fields required for all kinds of coding systems.  */
2838   coding->symbol = coding_system;
2839   coding->common_flags = 0;
2840   coding->mode = 0;
2841   coding->heading_ascii = -1;
2842   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2843   coding_spec = Fget (coding_system, Qcoding_system);
2844   if (!VECTORP (coding_spec)
2845       || XVECTOR (coding_spec)->size != 5
2846       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2847     goto label_invalid_coding_system;
2848
2849   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2850   if (VECTORP (eol_type))
2851     {
2852       coding->eol_type = CODING_EOL_UNDECIDED;
2853       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2854     }
2855   else if (XFASTINT (eol_type) == 1)
2856     {
2857       coding->eol_type = CODING_EOL_CRLF;
2858       coding->common_flags
2859         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2860     }
2861   else if (XFASTINT (eol_type) == 2)
2862     {
2863       coding->eol_type = CODING_EOL_CR;
2864       coding->common_flags
2865         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2866     }
2867   else
2868     coding->eol_type = CODING_EOL_LF;
2869
2870   coding_type = XVECTOR (coding_spec)->contents[0];
2871   /* Try short cut.  */
2872   if (SYMBOLP (coding_type))
2873     {
2874       if (EQ (coding_type, Qt))
2875         {
2876           coding->type = coding_type_undecided;
2877           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2878         }
2879       else
2880         coding->type = coding_type_no_conversion;
2881       return 0;
2882     }
2883
2884   /* Initialize remaining fields.  */
2885   coding->composing = 0;
2886
2887   /* Get values of coding system properties:
2888      `post-read-conversion', `pre-write-conversion',
2889      `translation-table-for-decode', `translation-table-for-encode'.  */
2890   plist = XVECTOR (coding_spec)->contents[3];
2891   coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2892   coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2893   val = Fplist_get (plist, Qtranslation_table_for_decode);
2894   if (SYMBOLP (val))
2895     val = Fget (val, Qtranslation_table_for_decode);
2896   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2897   val = Fplist_get (plist, Qtranslation_table_for_encode);
2898   if (SYMBOLP (val))
2899     val = Fget (val, Qtranslation_table_for_encode);
2900   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2901   val = Fplist_get (plist, Qcoding_category);
2902   if (!NILP (val))
2903     {
2904       val = Fget (val, Qcoding_category_index);
2905       if (INTEGERP (val))
2906         coding->category_idx = XINT (val);
2907       else
2908         goto label_invalid_coding_system;
2909     }
2910   else
2911     goto label_invalid_coding_system;
2912
2913   val = Fplist_get (plist, Qsafe_charsets);
2914   if (EQ (val, Qt))
2915     {
2916       for (i = 0; i <= MAX_CHARSET; i++)
2917         coding->safe_charsets[i] = 1;
2918     }
2919   else
2920     {
2921       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2922       while (CONSP (val))
2923         {
2924           if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2925             coding->safe_charsets[i] = 1;
2926           val = XCONS (val)->cdr;
2927         }
2928     }
2929
2930   switch (XFASTINT (coding_type))
2931     {
2932     case 0:
2933       coding->type = coding_type_emacs_mule;
2934       if (!NILP (coding->post_read_conversion))
2935         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2936       if (!NILP (coding->pre_write_conversion))
2937         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2938       break;
2939
2940     case 1:
2941       coding->type = coding_type_sjis;
2942       coding->common_flags
2943         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2944       break;
2945
2946     case 2:
2947       coding->type = coding_type_iso2022;
2948       coding->common_flags
2949         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2950       {
2951         Lisp_Object val, temp;
2952         Lisp_Object *flags;
2953         int i, charset, reg_bits = 0;
2954
2955         val = XVECTOR (coding_spec)->contents[4];
2956
2957         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2958           goto label_invalid_coding_system;
2959
2960         flags = XVECTOR (val)->contents;
2961         coding->flags
2962           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2963              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2964              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2965              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2966              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2967              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2968              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2969              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2970              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2971              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2972              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2973              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2974              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2975              );
2976
2977         /* Invoke graphic register 0 to plane 0.  */
2978         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2979         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2980         CODING_SPEC_ISO_INVOCATION (coding, 1)
2981           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2982         /* Not single shifting at first.  */
2983         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2984         /* Beginning of buffer should also be regarded as bol. */
2985         CODING_SPEC_ISO_BOL (coding) = 1;
2986
2987         for (charset = 0; charset <= MAX_CHARSET; charset++)
2988           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2989         val = Vcharset_revision_alist;
2990         while (CONSP (val))
2991           {
2992             charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2993             if (charset >= 0
2994                 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2995                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2996               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2997             val = XCONS (val)->cdr;
2998           }
2999
3000         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3001            FLAGS[REG] can be one of below:
3002                 integer CHARSET: CHARSET occupies register I,
3003                 t: designate nothing to REG initially, but can be used
3004                   by any charsets,
3005                 list of integer, nil, or t: designate the first
3006                   element (if integer) to REG initially, the remaining
3007                   elements (if integer) is designated to REG on request,
3008                   if an element is t, REG can be used by any charsets,
3009                 nil: REG is never used.  */
3010         for (charset = 0; charset <= MAX_CHARSET; charset++)
3011           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3012             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3013         for (i = 0; i < 4; i++)
3014           {
3015             if (INTEGERP (flags[i])
3016                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3017                 || (charset = get_charset_id (flags[i])) >= 0)
3018               {
3019                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3020                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3021               }
3022             else if (EQ (flags[i], Qt))
3023               {
3024                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3025                 reg_bits |= 1 << i;
3026                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3027               }
3028             else if (CONSP (flags[i]))
3029               {
3030                 Lisp_Object tail;
3031                 tail = flags[i];
3032
3033                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3034                 if (INTEGERP (XCONS (tail)->car)
3035                     && (charset = XINT (XCONS (tail)->car),
3036                         CHARSET_VALID_P (charset))
3037                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
3038                   {
3039                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3040                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3041                   }
3042                 else
3043                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3044                 tail = XCONS (tail)->cdr;
3045                 while (CONSP (tail))
3046                   {
3047                     if (INTEGERP (XCONS (tail)->car)
3048                         && (charset = XINT (XCONS (tail)->car),
3049                             CHARSET_VALID_P (charset))
3050                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
3051                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3052                         = i;
3053                     else if (EQ (XCONS (tail)->car, Qt))
3054                       reg_bits |= 1 << i;
3055                     tail = XCONS (tail)->cdr;
3056                   }
3057               }
3058             else
3059               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3060
3061             CODING_SPEC_ISO_DESIGNATION (coding, i)
3062               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3063           }
3064
3065         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3066           {
3067             /* REG 1 can be used only by locking shift in 7-bit env.  */
3068             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3069               reg_bits &= ~2;
3070             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3071               /* Without any shifting, only REG 0 and 1 can be used.  */
3072               reg_bits &= 3;
3073           }
3074
3075         if (reg_bits)
3076           for (charset = 0; charset <= MAX_CHARSET; charset++)
3077             {
3078               if (CHARSET_VALID_P (charset))
3079                 {
3080                   /* There exist some default graphic registers to be
3081                      used CHARSET.  */
3082
3083                   /* We had better avoid designating a charset of
3084                      CHARS96 to REG 0 as far as possible.  */
3085                   if (CHARSET_CHARS (charset) == 96)
3086                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3087                       = (reg_bits & 2
3088                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3089                   else
3090                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3091                       = (reg_bits & 1
3092                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3093                 }
3094             }
3095       }
3096       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3097       coding->spec.iso2022.last_invalid_designation_register = -1;
3098       break;
3099
3100     case 3:
3101       coding->type = coding_type_big5;
3102       coding->common_flags
3103         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3104       coding->flags
3105         = (NILP (XVECTOR (coding_spec)->contents[4])
3106            ? CODING_FLAG_BIG5_HKU
3107            : CODING_FLAG_BIG5_ETEN);
3108       break;
3109
3110     case 4:
3111       coding->type = coding_type_ccl;
3112       coding->common_flags
3113         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3114       {
3115         Lisp_Object val;
3116         Lisp_Object decoder, encoder;
3117
3118         val = XVECTOR (coding_spec)->contents[4];
3119         if (CONSP  (val)
3120             && SYMBOLP (XCONS (val)->car)
3121             && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
3122             && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
3123             && SYMBOLP (XCONS (val)->cdr)
3124             && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
3125             && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
3126           {
3127             setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3128             setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
3129           }
3130         else
3131           goto label_invalid_coding_system;
3132
3133         bzero (coding->spec.ccl.valid_codes, 256);
3134         val = Fplist_get (plist, Qvalid_codes);
3135         if (CONSP (val))
3136           {
3137             Lisp_Object this;
3138
3139             for (; CONSP (val); val = XCONS (val)->cdr)
3140               {
3141                 this = XCONS (val)->car;
3142                 if (INTEGERP (this)
3143                     && XINT (this) >= 0 && XINT (this) < 256)
3144                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3145                 else if (CONSP (this)
3146                          && INTEGERP (XCONS (this)->car)
3147                          && INTEGERP (XCONS (this)->cdr))
3148                   {
3149                     int start = XINT (XCONS (this)->car);
3150                     int end = XINT (XCONS (this)->cdr);
3151
3152                     if (start >= 0 && start <= end && end < 256)
3153                       while (start <= end)
3154                         coding->spec.ccl.valid_codes[start++] = 1;
3155                   }
3156               }
3157           }
3158       }
3159       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3160       break;
3161
3162     case 5:
3163       coding->type = coding_type_raw_text;
3164       break;
3165
3166     default:
3167       goto label_invalid_coding_system;
3168     }
3169   return 0;
3170
3171  label_invalid_coding_system:
3172   coding->type = coding_type_no_conversion;
3173   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3174   coding->common_flags = 0;
3175   coding->eol_type = CODING_EOL_LF;
3176   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3177   return -1;
3178 }
3179
3180 /* Setup raw-text or one of its subsidiaries in the structure
3181    coding_system CODING according to the already setup value eol_type
3182    in CODING.  CODING should be setup for some coding system in
3183    advance.  */
3184
3185 void
3186 setup_raw_text_coding_system (coding)
3187      struct coding_system *coding;
3188 {
3189   if (coding->type != coding_type_raw_text)
3190     {
3191       coding->symbol = Qraw_text;
3192       coding->type = coding_type_raw_text;
3193       if (coding->eol_type != CODING_EOL_UNDECIDED)
3194         {
3195           Lisp_Object subsidiaries;
3196           subsidiaries = Fget (Qraw_text, Qeol_type);
3197
3198           if (VECTORP (subsidiaries)
3199               && XVECTOR (subsidiaries)->size == 3)
3200             coding->symbol
3201               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3202         }
3203     }
3204   return;
3205 }
3206
3207 /* Emacs has a mechanism to automatically detect a coding system if it
3208    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3209    it's impossible to distinguish some coding systems accurately
3210    because they use the same range of codes.  So, at first, coding
3211    systems are categorized into the following categories:
3212
3213    o coding-category-emacs-mule
3214
3215         The category for a coding system which has the same code range
3216         as Emacs' internal format.  Assigned the coding-system (Lisp
3217         symbol) `emacs-mule' by default.
3218
3219    o coding-category-sjis
3220
3221         The category for a coding system which has the same code range
3222         as SJIS.  Assigned the coding-system (Lisp
3223         symbol) `japanese-shift-jis' by default.
3224
3225    o coding-category-iso-7
3226
3227         The category for a coding system which has the same code range
3228         as ISO2022 of 7-bit environment.  This doesn't use any locking
3229         shift and single shift functions.  This can encode/decode all
3230         charsets.  Assigned the coding-system (Lisp symbol)
3231         `iso-2022-7bit' by default.
3232
3233    o coding-category-iso-7-tight
3234
3235         Same as coding-category-iso-7 except that this can
3236         encode/decode only the specified charsets.
3237
3238    o coding-category-iso-8-1
3239
3240         The category for a coding system which has the same code range
3241         as ISO2022 of 8-bit environment and graphic plane 1 used only
3242         for DIMENSION1 charset.  This doesn't use any locking shift
3243         and single shift functions.  Assigned the coding-system (Lisp
3244         symbol) `iso-latin-1' by default.
3245
3246    o coding-category-iso-8-2
3247
3248         The category for a coding system which has the same code range
3249         as ISO2022 of 8-bit environment and graphic plane 1 used only
3250         for DIMENSION2 charset.  This doesn't use any locking shift
3251         and single shift functions.  Assigned the coding-system (Lisp
3252         symbol) `japanese-iso-8bit' by default.
3253
3254    o coding-category-iso-7-else
3255
3256         The category for a coding system which has the same code range
3257         as ISO2022 of 7-bit environment but uses locking shift or
3258         single shift functions.  Assigned the coding-system (Lisp
3259         symbol) `iso-2022-7bit-lock' by default.
3260
3261    o coding-category-iso-8-else
3262
3263         The category for a coding system which has the same code range
3264         as ISO2022 of 8-bit environment but uses locking shift or
3265         single shift functions.  Assigned the coding-system (Lisp
3266         symbol) `iso-2022-8bit-ss2' by default.
3267
3268    o coding-category-big5
3269
3270         The category for a coding system which has the same code range
3271         as BIG5.  Assigned the coding-system (Lisp symbol)
3272         `cn-big5' by default.
3273
3274    o coding-category-ccl
3275
3276         The category for a coding system of which encoder/decoder is
3277         written in CCL programs.  The default value is nil, i.e., no
3278         coding system is assigned.
3279
3280    o coding-category-binary
3281
3282         The category for a coding system not categorized in any of the
3283         above.  Assigned the coding-system (Lisp symbol)
3284         `no-conversion' by default.
3285
3286    Each of them is a Lisp symbol and the value is an actual
3287    `coding-system's (this is also a Lisp symbol) assigned by a user.
3288    What Emacs does actually is to detect a category of coding system.
3289    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3290    decide only one possible category, it selects a category of the
3291    highest priority.  Priorities of categories are also specified by a
3292    user in a Lisp variable `coding-category-list'.
3293
3294 */
3295
3296 static
3297 int ascii_skip_code[256];
3298
3299 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3300    If it detects possible coding systems, return an integer in which
3301    appropriate flag bits are set.  Flag bits are defined by macros
3302    CODING_CATEGORY_MASK_XXX in `coding.h'.
3303
3304    How many ASCII characters are at the head is returned as *SKIP.  */
3305
3306 static int
3307 detect_coding_mask (source, src_bytes, priorities, skip)
3308      unsigned char *source;
3309      int src_bytes, *priorities, *skip;
3310 {
3311   register unsigned char c;
3312   unsigned char *src = source, *src_end = source + src_bytes;
3313   unsigned int mask;
3314   int i;
3315
3316   /* At first, skip all ASCII characters and control characters except
3317      for three ISO2022 specific control characters.  */
3318   ascii_skip_code[ISO_CODE_SO] = 0;
3319   ascii_skip_code[ISO_CODE_SI] = 0;
3320   ascii_skip_code[ISO_CODE_ESC] = 0;
3321
3322  label_loop_detect_coding:
3323   while (src < src_end && ascii_skip_code[*src]) src++;
3324   *skip = src - source;
3325
3326   if (src >= src_end)
3327     /* We found nothing other than ASCII.  There's nothing to do.  */
3328     return 0;
3329
3330   c = *src;
3331   /* The text seems to be encoded in some multilingual coding system.
3332      Now, try to find in which coding system the text is encoded.  */
3333   if (c < 0x80)
3334     {
3335       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3336       /* C is an ISO2022 specific control code of C0.  */
3337       mask = detect_coding_iso2022 (src, src_end);
3338       if (mask == 0)
3339         {
3340           /* No valid ISO2022 code follows C.  Try again.  */
3341           src++;
3342           if (c == ISO_CODE_ESC)
3343             ascii_skip_code[ISO_CODE_ESC] = 1;
3344           else
3345             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3346           goto label_loop_detect_coding;
3347         }
3348       if (priorities)
3349         goto label_return_highest_only;
3350     }
3351   else
3352     {
3353       int try;
3354
3355       if (c < 0xA0)
3356         {
3357           /* C is the first byte of SJIS character code,
3358              or a leading-code of Emacs' internal format (emacs-mule).  */
3359           try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3360
3361           /* Or, if C is a special latin extra code,
3362              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3363              or is an ISO2022 control-sequence-introducer (CSI),
3364              we should also consider the possibility of ISO2022 codings.  */
3365           if ((VECTORP (Vlatin_extra_code_table)
3366                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3367               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3368               || (c == ISO_CODE_CSI
3369                   && (src < src_end
3370                       && (*src == ']'
3371                           || ((*src == '0' || *src == '1' || *src == '2')
3372                               && src + 1 < src_end
3373                               && src[1] == ']')))))
3374             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3375                      | CODING_CATEGORY_MASK_ISO_8BIT);
3376         }
3377       else
3378         /* C is a character of ISO2022 in graphic plane right,
3379            or a SJIS's 1-byte character code (i.e. JISX0201),
3380            or the first byte of BIG5's 2-byte code.  */
3381         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3382                 | CODING_CATEGORY_MASK_ISO_8BIT
3383                 | CODING_CATEGORY_MASK_SJIS
3384                 | CODING_CATEGORY_MASK_BIG5);
3385
3386       /* Or, we may have to consider the possibility of CCL.  */
3387       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3388           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3389               ->spec.ccl.valid_codes)[c])
3390         try |= CODING_CATEGORY_MASK_CCL;
3391
3392       mask = 0;
3393       if (priorities)
3394         {
3395           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3396             {
3397               if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
3398                 mask = detect_coding_iso2022 (src, src_end);
3399               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3400                 mask = detect_coding_sjis (src, src_end);
3401               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3402                 mask = detect_coding_big5 (src, src_end);
3403               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3404                 mask = detect_coding_emacs_mule (src, src_end);
3405               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3406                 mask = detect_coding_ccl (src, src_end);
3407               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3408                 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3409               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3410                 mask = CODING_CATEGORY_MASK_BINARY;
3411               if (mask)
3412                 goto label_return_highest_only;
3413             }
3414           return CODING_CATEGORY_MASK_RAW_TEXT;
3415         }
3416       if (try & CODING_CATEGORY_MASK_ISO)
3417         mask |= detect_coding_iso2022 (src, src_end);
3418       if (try & CODING_CATEGORY_MASK_SJIS)
3419         mask |= detect_coding_sjis (src, src_end);
3420       if (try & CODING_CATEGORY_MASK_BIG5)
3421         mask |= detect_coding_big5 (src, src_end);
3422       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3423         mask |= detect_coding_emacs_mule (src, src_end);
3424       if (try & CODING_CATEGORY_MASK_CCL)
3425         mask |= detect_coding_ccl (src, src_end);
3426     }
3427   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3428
3429  label_return_highest_only:
3430   for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3431     {
3432       if (mask & priorities[i])
3433         return priorities[i];
3434     }
3435   return CODING_CATEGORY_MASK_RAW_TEXT;
3436 }
3437
3438 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3439    The information of the detected coding system is set in CODING.  */
3440
3441 void
3442 detect_coding (coding, src, src_bytes)
3443      struct coding_system *coding;
3444      unsigned char *src;
3445      int src_bytes;
3446 {
3447   unsigned int idx;
3448   int skip, mask, i;
3449   Lisp_Object val;
3450
3451   val = Vcoding_category_list;
3452   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3453   coding->heading_ascii = skip;
3454
3455   if (!mask) return;
3456
3457   /* We found a single coding system of the highest priority in MASK.  */
3458   idx = 0;
3459   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3460   if (! mask)
3461     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3462
3463   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3464
3465   if (coding->eol_type != CODING_EOL_UNDECIDED)
3466     {
3467       Lisp_Object tmp;
3468
3469       tmp = Fget (val, Qeol_type);
3470       if (VECTORP (tmp))
3471         val = XVECTOR (tmp)->contents[coding->eol_type];
3472     }
3473   setup_coding_system (val, coding);
3474   /* Set this again because setup_coding_system reset this member.  */
3475   coding->heading_ascii = skip;
3476 }
3477
3478 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3479    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3480    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3481
3482    How many non-eol characters are at the head is returned as *SKIP.  */
3483
3484 #define MAX_EOL_CHECK_COUNT 3
3485
3486 static int
3487 detect_eol_type (source, src_bytes, skip)
3488      unsigned char *source;
3489      int src_bytes, *skip;
3490 {
3491   unsigned char *src = source, *src_end = src + src_bytes;
3492   unsigned char c;
3493   int total = 0;                /* How many end-of-lines are found so far.  */
3494   int eol_type = CODING_EOL_UNDECIDED;
3495   int this_eol_type;
3496
3497   *skip = 0;
3498
3499   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3500     {
3501       c = *src++;
3502       if (c == '\n' || c == '\r')
3503         {
3504           if (*skip == 0)
3505             *skip = src - 1 - source;
3506           total++;
3507           if (c == '\n')
3508             this_eol_type = CODING_EOL_LF;
3509           else if (src >= src_end || *src != '\n')
3510             this_eol_type = CODING_EOL_CR;
3511           else
3512             this_eol_type = CODING_EOL_CRLF, src++;
3513
3514           if (eol_type == CODING_EOL_UNDECIDED)
3515             /* This is the first end-of-line.  */
3516             eol_type = this_eol_type;
3517           else if (eol_type != this_eol_type)
3518             {
3519               /* The found type is different from what found before.  */
3520               eol_type = CODING_EOL_INCONSISTENT;
3521               break;
3522             }
3523         }
3524     }
3525
3526   if (*skip == 0)
3527     *skip = src_end - source;
3528   return eol_type;
3529 }
3530
3531 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3532    is encoded.  If it detects an appropriate format of end-of-line, it
3533    sets the information in *CODING.  */
3534
3535 void
3536 detect_eol (coding, src, src_bytes)
3537      struct coding_system *coding;
3538      unsigned char *src;
3539      int src_bytes;
3540 {
3541   Lisp_Object val;
3542   int skip;
3543   int eol_type = detect_eol_type (src, src_bytes, &skip);
3544
3545   if (coding->heading_ascii > skip)
3546     coding->heading_ascii = skip;
3547   else
3548     skip = coding->heading_ascii;
3549
3550   if (eol_type == CODING_EOL_UNDECIDED)
3551     return;
3552   if (eol_type == CODING_EOL_INCONSISTENT)
3553     {
3554 #if 0
3555       /* This code is suppressed until we find a better way to
3556          distinguish raw text file and binary file.  */
3557
3558       /* If we have already detected that the coding is raw-text, the
3559          coding should actually be no-conversion.  */
3560       if (coding->type == coding_type_raw_text)
3561         {
3562           setup_coding_system (Qno_conversion, coding);
3563           return;
3564         }
3565       /* Else, let's decode only text code anyway.  */
3566 #endif /* 0 */
3567       eol_type = CODING_EOL_LF;
3568     }
3569
3570   val = Fget (coding->symbol, Qeol_type);
3571   if (VECTORP (val) && XVECTOR (val)->size == 3)
3572     {
3573       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3574       coding->heading_ascii = skip;
3575     }
3576 }
3577
/* Extra bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the size of a text may grow when it is decoded by
   CODING (a pointer to struct coding_system): 3 for ISO2022, 2 for
   SJIS and BIG5, 1 for raw-text, the value declared by the decoder
   program for CCL, and 2 for anything else.  The argument is
   parenthesized so that any pointer expression can be passed.  */
#define DECODING_BUFFER_MAG(coding)                                           \
  ((coding)->type == coding_type_iso2022                                      \
   ? 3                                                                        \
   : (((coding)->type == coding_type_sjis                                     \
       || (coding)->type == coding_type_big5)                                 \
      ? 2                                                                     \
      : ((coding)->type == coding_type_raw_text                               \
         ? 1                                                                  \
         : ((coding)->type == coding_type_ccl                                 \
            ? (coding)->spec.ccl.decoder.buf_magnification                    \
            : 2))))
3590
3591 /* Return maximum size (bytes) of a buffer enough for decoding
3592    SRC_BYTES of text encoded in CODING.  */
3593
3594 int
3595 decoding_buffer_size (coding, src_bytes)
3596      struct coding_system *coding;
3597      int src_bytes;
3598 {
3599   return (src_bytes * DECODING_BUFFER_MAG (coding)
3600           + CONVERSION_BUFFER_EXTRA_ROOM);
3601 }
3602
3603 /* Return maximum size (bytes) of a buffer enough for encoding
3604    SRC_BYTES of text to CODING.  */
3605
3606 int
3607 encoding_buffer_size (coding, src_bytes)
3608      struct coding_system *coding;
3609      int src_bytes;
3610 {
3611   int magnification;
3612
3613   if (coding->type == coding_type_ccl)
3614     magnification = coding->spec.ccl.encoder.buf_magnification;
3615   else
3616     magnification = 3;
3617
3618   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3619 }
3620
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Work buffer shared by the encoding and decoding routines, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  The shared buffer is reused when it is large enough;
   otherwise it is replaced by a new one whose size is the current
   size doubled until it reaches SIZE.  The old contents are not
   preserved.

   NOTE(review): the previous comment said NULL is returned when we
   run out of memory; that depends on how xmalloc reports failure --
   confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size;

      /* Never start doubling from less than the minimum size.  A
         start value of zero would never grow and the loop below would
         not terminate.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;

      /* Allocate the new buffer before releasing the old one.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3649
3650 int
3651 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3652      struct coding_system *coding;
3653      unsigned char *source, *destination;
3654      int src_bytes, dst_bytes, encodep;
3655 {
3656   struct ccl_program *ccl
3657     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3658   int result;
3659
3660   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3661
3662   coding->produced = ccl_driver (ccl, source, destination,
3663                                  src_bytes, dst_bytes, &(coding->consumed));
3664   coding->produced_char
3665     = multibyte_chars_in_text (destination, coding->produced);
3666   coding->consumed_char
3667     = multibyte_chars_in_text (source, coding->consumed);
3668
3669   switch (ccl->status)
3670     {
3671     case CCL_STAT_SUSPEND_BY_SRC:
3672       result = CODING_FINISH_INSUFFICIENT_SRC;
3673       break;
3674     case CCL_STAT_SUSPEND_BY_DST:
3675       result = CODING_FINISH_INSUFFICIENT_DST;
3676       break;
3677     case CCL_STAT_QUIT:
3678     case CCL_STAT_INVALID_CMD:
3679       result = CODING_FINISH_INTERRUPT;
3680       break;
3681     default:
3682       result = CODING_FINISH_NORMAL;
3683       break;
3684     }
3685   return result;
3686 }
3687
3688 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
3689    decoding, it may detect coding system and format of end-of-line if
3690    those are not yet decided.  */
3691
3692 int
3693 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3694      struct coding_system *coding;
3695      unsigned char *source, *destination;
3696      int src_bytes, dst_bytes;
3697 {
3698   int result;
3699
3700   if (src_bytes <= 0
3701       && coding->type != coding_type_ccl
3702       && ! (coding->mode & CODING_MODE_LAST_BLOCK
3703             && CODING_REQUIRE_FLUSHING (coding)))
3704     {
3705       coding->produced = coding->produced_char = 0;
3706       coding->consumed = coding->consumed_char = 0;
3707       coding->fake_multibyte = 0;
3708       return CODING_FINISH_NORMAL;
3709     }
3710
3711   if (coding->type == coding_type_undecided)
3712     detect_coding (coding, source, src_bytes);
3713
3714   if (coding->eol_type == CODING_EOL_UNDECIDED)
3715     detect_eol (coding, source, src_bytes);
3716
3717   switch (coding->type)
3718     {
3719     case coding_type_emacs_mule:
3720     case coding_type_undecided:
3721     case coding_type_raw_text:
3722       if (coding->eol_type == CODING_EOL_LF
3723           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3724         goto label_no_conversion;
3725       result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3726       break;
3727
3728     case coding_type_sjis:
3729       result = decode_coding_sjis_big5 (coding, source, destination,
3730                                         src_bytes, dst_bytes, 1);
3731       break;
3732
3733     case coding_type_iso2022:
3734       result = decode_coding_iso2022 (coding, source, destination,
3735                                       src_bytes, dst_bytes);
3736       break;
3737
3738     case coding_type_big5:
3739       result = decode_coding_sjis_big5 (coding, source, destination,
3740                                         src_bytes, dst_bytes, 0);
3741       break;
3742
3743     case coding_type_ccl:
3744       result = ccl_coding_driver (coding, source, destination,
3745                                   src_bytes, dst_bytes, 0);
3746       break;
3747
3748     default:                    /* i.e. case coding_type_no_conversion: */
3749     label_no_conversion:
3750       if (dst_bytes && src_bytes > dst_bytes)
3751         {
3752           coding->produced = dst_bytes;
3753           result = CODING_FINISH_INSUFFICIENT_DST;
3754         }
3755       else
3756         {
3757           coding->produced = src_bytes;
3758           result = CODING_FINISH_NORMAL;
3759         }
3760       if (dst_bytes)
3761         bcopy (source, destination, coding->produced);
3762       else
3763         safe_bcopy (source, destination, coding->produced);
3764       coding->fake_multibyte = 1;
3765       coding->consumed
3766         = coding->consumed_char = coding->produced_char = coding->produced;
3767       break;
3768     }
3769
3770   return result;
3771 }
3772
3773 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
3774
3775 int
3776 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3777      struct coding_system *coding;
3778      unsigned char *source, *destination;
3779      int src_bytes, dst_bytes;
3780 {
3781   int result;
3782
3783   if (src_bytes <= 0
3784       && ! (coding->mode & CODING_MODE_LAST_BLOCK
3785             && CODING_REQUIRE_FLUSHING (coding)))
3786     {
3787       coding->produced = coding->produced_char = 0;
3788       coding->consumed = coding->consumed_char = 0;
3789       coding->fake_multibyte = 0;
3790       return CODING_FINISH_NORMAL;
3791     }
3792
3793   switch (coding->type)
3794     {
3795     case coding_type_emacs_mule:
3796     case coding_type_undecided:
3797     case coding_type_raw_text:
3798       if (coding->eol_type == CODING_EOL_LF
3799           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3800         goto label_no_conversion;
3801       result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3802       break;
3803
3804     case coding_type_sjis:
3805       result = encode_coding_sjis_big5 (coding, source, destination,
3806                                         src_bytes, dst_bytes, 1);
3807       break;
3808
3809     case coding_type_iso2022:
3810       result = encode_coding_iso2022 (coding, source, destination,
3811                                       src_bytes, dst_bytes);
3812       break;
3813
3814     case coding_type_big5:
3815       result = encode_coding_sjis_big5 (coding, source, destination,
3816                                         src_bytes, dst_bytes, 0);
3817       break;
3818
3819     case coding_type_ccl:
3820       result = ccl_coding_driver (coding, source, destination,
3821                                   src_bytes, dst_bytes, 1);
3822       break;
3823
3824     default:                    /* i.e. case coding_type_no_conversion: */
3825     label_no_conversion:
3826       if (dst_bytes && src_bytes > dst_bytes)
3827         {
3828           coding->produced = dst_bytes;
3829           result = CODING_FINISH_INSUFFICIENT_DST;
3830         }
3831       else
3832         {
3833           coding->produced = src_bytes;
3834           result = CODING_FINISH_NORMAL;
3835         }
3836       if (dst_bytes)
3837         bcopy (source, destination, coding->produced);
3838       else
3839         safe_bcopy (source, destination, coding->produced);
3840       if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3841         {
3842           unsigned char *p = destination, *pend = p + coding->produced;
3843           while (p < pend)
3844             if (*p++ == '\015') p[-1] = '\n';
3845         }
3846       coding->fake_multibyte = 1;
3847       coding->consumed
3848         = coding->consumed_char = coding->produced_char = coding->produced;
3849       break;
3850     }
3851
3852   return result;
3853 }
3854
3855 /* Scan text in the region between *BEG and *END (byte positions),
3856    skip characters which we don't have to decode by coding system
3857    CODING at the head and tail, then set *BEG and *END to the region
3858    of the text we actually have to convert.  The caller should move
3859    the gap out of the region in advance.
3860
3861    If STR is not NULL, *BEG and *END are indices into STR.  */
3862
3863 static void
3864 shrink_decoding_region (beg, end, coding, str)
3865      int *beg, *end;
3866      struct coding_system *coding;
3867      unsigned char *str;
3868 {
3869   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3870   int eol_conversion;
3871   Lisp_Object translation_table;
3872
3873   if (coding->type == coding_type_ccl
3874       || coding->type == coding_type_undecided
3875       || !NILP (coding->post_read_conversion))
3876     {
3877       /* We can't skip any data.  */
3878       return;
3879     }
3880   else if (coding->type == coding_type_no_conversion)
3881     {
3882       /* We need no conversion, but don't have to skip any data here.
3883          Decoding routine handles them effectively anyway.  */
3884       return;
3885     }
3886
3887   translation_table = coding->translation_table_for_decode;
3888   if (NILP (translation_table) && !NILP (Venable_character_translation))
3889     translation_table = Vstandard_translation_table_for_decode;
3890   if (CHAR_TABLE_P (translation_table))
3891     {
3892       int i;
3893       for (i = 0; i < 128; i++)
3894         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3895           break;
3896       if (i < 128)
3897         /* Some ASCII character should be tranlsated.  We give up
3898            shrinking.  */
3899         return;
3900     }
3901
3902   eol_conversion = (coding->eol_type != CODING_EOL_LF);
3903
3904   if ((! eol_conversion) && (coding->heading_ascii >= 0))
3905     /* Detection routine has already found how much we can skip at the
3906        head.  */
3907     *beg += coding->heading_ascii;
3908
3909   if (str)
3910     {
3911       begp_orig = begp = str + *beg;
3912       endp_orig = endp = str + *end;
3913     }
3914   else
3915     {
3916       begp_orig = begp = BYTE_POS_ADDR (*beg);
3917       endp_orig = endp = begp + *end - *beg;
3918     }
3919
3920   switch (coding->type)
3921     {
3922     case coding_type_emacs_mule:
3923     case coding_type_raw_text:
3924       if (eol_conversion)
3925         {
3926           if (coding->heading_ascii < 0)
3927             while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3928           while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
3929             endp--;
3930           /* Do not consider LF as ascii if preceded by CR, since that
3931              confuses eol decoding. */
3932           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3933             endp++;
3934         }
3935       else
3936         begp = endp;
3937       break;
3938
3939     case coding_type_sjis:
3940     case coding_type_big5:
3941       /* We can skip all ASCII characters at the head.  */
3942       if (coding->heading_ascii < 0)
3943         {
3944           if (eol_conversion)
3945             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
3946           else
3947             while (begp < endp && *begp < 0x80) begp++;
3948         }
3949       /* We can skip all ASCII characters at the tail except for the
3950          second byte of SJIS or BIG5 code.  */
3951       if (eol_conversion)
3952         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
3953       else
3954         while (begp < endp && endp[-1] < 0x80) endp--;
3955       /* Do not consider LF as ascii if preceded by CR, since that
3956          confuses eol decoding. */
3957       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3958         endp++;
3959       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3960         endp++;
3961       break;
3962
3963     default:            /* i.e. case coding_type_iso2022: */
3964       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
3965         /* We can't skip any data.  */
3966         break;
3967       if (coding->heading_ascii < 0)
3968         {
3969           /* We can skip all ASCII characters at the head except for a
3970              few control codes.  */
3971           while (begp < endp && (c = *begp) < 0x80
3972                  && c != ISO_CODE_CR && c != ISO_CODE_SO
3973                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
3974                  && (!eol_conversion || c != ISO_CODE_LF))
3975             begp++;
3976         }
3977       switch (coding->category_idx)
3978         {
3979         case CODING_CATEGORY_IDX_ISO_8_1:
3980         case CODING_CATEGORY_IDX_ISO_8_2:
3981           /* We can skip all ASCII characters at the tail.  */
3982           if (eol_conversion)
3983             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
3984           else
3985             while (begp < endp && endp[-1] < 0x80) endp--;
3986           /* Do not consider LF as ascii if preceded by CR, since that
3987              confuses eol decoding. */
3988           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3989             endp++;
3990           break;
3991
3992         case CODING_CATEGORY_IDX_ISO_7:
3993         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
3994           {
3995             /* We can skip all charactes at the tail except for 8-bit
3996                codes and ESC and the following 2-byte at the tail.  */
3997             unsigned char *eight_bit = NULL;
3998
3999             if (eol_conversion)
4000               while (begp < endp
4001                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4002                 {
4003                   if (!eight_bit && c & 0x80) eight_bit = endp;
4004                   endp--;
4005                 }
4006             else
4007               while (begp < endp
4008                      && (c = endp[-1]) != ISO_CODE_ESC)
4009                 {
4010                   if (!eight_bit && c & 0x80) eight_bit = endp;
4011                   endp--;
4012                 }
4013             /* Do not consider LF as ascii if preceded by CR, since that
4014                confuses eol decoding. */
4015             if (begp < endp && endp < endp_orig
4016                 && endp[-1] == '\r' && endp[0] == '\n')
4017               endp++;
4018             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4019               {
4020                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4021                   /* This is an ASCII designation sequence.  We can
4022                      surely skip the tail.  But, if we have
4023                      encountered an 8-bit code, skip only the codes
4024                      after that.  */
4025                   endp = eight_bit ? eight_bit : endp + 2;
4026                 else
4027                   /* Hmmm, we can't skip the tail.  */
4028                   endp = endp_orig;
4029               }
4030             else if (eight_bit)
4031               endp = eight_bit;
4032           }
4033         }
4034     }
4035   *beg += begp - begp_orig;
4036   *end += endp - endp_orig;
4037   return;
4038 }
4039
4040 /* Like shrink_decoding_region but for encoding.  */
4041
4042 static void
4043 shrink_encoding_region (beg, end, coding, str)
4044      int *beg, *end;
4045      struct coding_system *coding;
4046      unsigned char *str;
4047 {
4048   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4049   int eol_conversion;
4050   Lisp_Object translation_table;
4051
4052   if (coding->type == coding_type_ccl)
4053     /* We can't skip any data.  */
4054     return;
4055   else if (coding->type == coding_type_no_conversion)
4056     {
4057       /* We need no conversion.  */
4058       *beg = *end;
4059       return;
4060     }
4061
4062   translation_table = coding->translation_table_for_encode;
4063   if (NILP (translation_table) && !NILP (Venable_character_translation))
4064     translation_table = Vstandard_translation_table_for_encode;
4065   if (CHAR_TABLE_P (translation_table))
4066     {
4067       int i;
4068       for (i = 0; i < 128; i++)
4069         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4070           break;
4071       if (i < 128)
4072         /* Some ASCII character should be tranlsated.  We give up
4073            shrinking.  */
4074         return;
4075     }
4076
4077   if (str)
4078     {
4079       begp_orig = begp = str + *beg;
4080       endp_orig = endp = str + *end;
4081     }
4082   else
4083     {
4084       begp_orig = begp = BYTE_POS_ADDR (*beg);
4085       endp_orig = endp = begp + *end - *beg;
4086     }
4087
4088   eol_conversion = (coding->eol_type == CODING_EOL_CR
4089                     || coding->eol_type == CODING_EOL_CRLF);
4090
4091   /* Here, we don't have to check coding->pre_write_conversion because
4092      the caller is expected to have handled it already.  */
4093   switch (coding->type)
4094     {
4095     case coding_type_undecided:
4096     case coding_type_emacs_mule:
4097     case coding_type_raw_text:
4098       if (eol_conversion)
4099         {
4100           while (begp < endp && *begp != '\n') begp++;
4101           while (begp < endp && endp[-1] != '\n') endp--;
4102         }
4103       else
4104         begp = endp;
4105       break;
4106
4107     case coding_type_iso2022:
4108       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4109         /* We can't skip any data.  */
4110         break;
4111       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4112         {
4113           unsigned char *bol = begp;
4114           while (begp < endp && *begp < 0x80)
4115             {
4116               begp++;
4117               if (begp[-1] == '\n')
4118                 bol = begp;
4119             }
4120           begp = bol;
4121           goto label_skip_tail;
4122         }
4123       /* fall down ... */
4124
4125     default:
4126       /* We can skip all ASCII characters at the head and tail.  */
4127       if (eol_conversion)
4128         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4129       else
4130         while (begp < endp && *begp < 0x80) begp++;
4131     label_skip_tail:
4132       if (eol_conversion)
4133         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4134       else
4135         while (begp < endp && *(endp - 1) < 0x80) endp--;
4136       break;
4137     }
4138
4139   *beg += begp - begp_orig;
4140   *end += endp - endp_orig;
4141   return;
4142 }
4143
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region between *BEG and *END to the part which really
   has to be converted by CODING, provided that the region is longer
   than the threshold above.  BEG and END are pointers to int.
   ENCODEP nonzero means shrink for encoding, zero for decoding.  STR
   is handed to the shrinking function as is.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
      {                                                                 \
        if (! (encodep))                                                \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4157
4158 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4159    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4160    coding system CODING, and return the status code of code conversion
4161    (currently, this value has no meaning).
4162
4163    How many characters (and bytes) are converted to how many
4164    characters (and bytes) are recorded in members of the structure
4165    CODING.
4166
4167    If REPLACE is nonzero, we do various things as if the original text
4168    is deleted and a new text is inserted.  See the comments in
4169    replace_range (insdel.c) to know what we are doing.  */
4170
4171 int
4172 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4173      int from, from_byte, to, to_byte, encodep, replace;
4174      struct coding_system *coding;
4175 {
4176   int len = to - from, len_byte = to_byte - from_byte;
4177   int require, inserted, inserted_byte;
4178   int head_skip, tail_skip, total_skip;
4179   Lisp_Object saved_coding_symbol;
4180   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4181   int first = 1;
4182   int fake_multibyte = 0;
4183   unsigned char *src, *dst;
4184   Lisp_Object deletion;
4185   int orig_point = PT, orig_len = len;
4186   int prev_Z;
4187
4188   deletion = Qnil;
4189   saved_coding_symbol = Qnil;
4190
4191   if (from < PT && PT < to)
4192     {
4193       TEMP_SET_PT_BOTH (from, from_byte);
4194       orig_point = from;
4195     }
4196
4197   if (replace)
4198     {
4199       int saved_from = from;
4200
4201       prepare_to_modify_buffer (from, to, &from);
4202       if (saved_from != from)
4203         {
4204           to = from + len;
4205           if (multibyte)
4206             from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4207           else
4208             from_byte = from, to_byte = to;
4209           len_byte = to_byte - from_byte;
4210         }
4211     }
4212
4213   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4214     {
4215       /* We must detect encoding of text and eol format.  */
4216
4217       if (from < GPT && to > GPT)
4218         move_gap_both (from, from_byte);
4219       if (coding->type == coding_type_undecided)
4220         {
4221           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4222           if (coding->type == coding_type_undecided)
4223             /* It seems that the text contains only ASCII, but we
4224                should not left it undecided because the deeper
4225                decoding routine (decode_coding) tries to detect the
4226                encodings again in vain.  */
4227             coding->type = coding_type_emacs_mule;
4228         }
4229       if (coding->eol_type == CODING_EOL_UNDECIDED)
4230         {
4231           saved_coding_symbol = coding->symbol;
4232           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4233           if (coding->eol_type == CODING_EOL_UNDECIDED)
4234             coding->eol_type = CODING_EOL_LF;
4235           /* We had better recover the original eol format if we
4236              encounter an inconsitent eol format while decoding.  */
4237           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4238         }
4239     }
4240
4241   coding->consumed_char = len, coding->consumed = len_byte;
4242
4243   if (encodep
4244       ? ! CODING_REQUIRE_ENCODING (coding)
4245       : ! CODING_REQUIRE_DECODING (coding))
4246     {
4247       coding->produced = len_byte;
4248       if (multibyte
4249           && ! replace
4250           /* See the comment of the member heading_ascii in coding.h.  */
4251           && coding->heading_ascii < len_byte)
4252         {
4253           /* We still may have to combine byte at the head and the
4254              tail of the text in the region.  */
4255           if (from < GPT && GPT < to)
4256             move_gap_both (to, to_byte);
4257           len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4258           adjust_after_insert (from, from_byte, to, to_byte, len);
4259           coding->produced_char = len;
4260         }
4261       else
4262         {
4263           if (!replace)
4264             adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4265           coding->produced_char = len_byte;
4266         }
4267       return 0;
4268     }
4269
4270   /* Now we convert the text.  */
4271
4272   /* For encoding, we must process pre-write-conversion in advance.  */
4273   if (encodep
4274       && ! NILP (coding->pre_write_conversion)
4275       && SYMBOLP (coding->pre_write_conversion)
4276       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4277     {
4278       /* The function in pre-write-conversion may put a new text in a
4279          new buffer.  */
4280       struct buffer *prev = current_buffer;
4281       Lisp_Object new;
4282
4283       call2 (coding->pre_write_conversion,
4284              make_number (from), make_number (to));
4285       if (current_buffer != prev)
4286         {
4287           len = ZV - BEGV;
4288           new = Fcurrent_buffer ();
4289           set_buffer_internal_1 (prev);
4290           del_range_2 (from, from_byte, to, to_byte);
4291           TEMP_SET_PT_BOTH (from, from_byte);
4292           insert_from_buffer (XBUFFER (new), 1, len, 0);
4293           Fkill_buffer (new);
4294           if (orig_point >= to)
4295             orig_point += len - orig_len;
4296           else if (orig_point > from)
4297             orig_point = from;
4298           orig_len = len;
4299           to = from + len;
4300           from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4301           to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4302           len_byte = to_byte - from_byte;
4303           TEMP_SET_PT_BOTH (from, from_byte);
4304         }
4305     }
4306
4307   if (replace)
4308     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4309
4310   /* Try to skip the heading and tailing ASCIIs.  */
4311   {
4312     int from_byte_orig = from_byte, to_byte_orig = to_byte;
4313
4314     if (from < GPT && GPT < to)
4315       move_gap_both (from, from_byte);
4316     SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4317     if (from_byte == to_byte
4318         && coding->type != coding_type_ccl
4319         && ! (coding->mode & CODING_MODE_LAST_BLOCK
4320               && CODING_REQUIRE_FLUSHING (coding)))
4321       {
4322         coding->produced = len_byte;
4323         coding->produced_char = multibyte ? len : len_byte;
4324         if (!replace)
4325           /* We must record and adjust for this new text now.  */
4326           adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4327         return 0;
4328       }
4329
4330     head_skip = from_byte - from_byte_orig;
4331     tail_skip = to_byte_orig - to_byte;
4332     total_skip = head_skip + tail_skip;
4333     from += head_skip;
4334     to -= tail_skip;
4335     len -= total_skip; len_byte -= total_skip;
4336   }
4337
4338   /* The code conversion routine can not preserve text properties for
4339      now.  So, we must remove all text properties in the region.
4340      Here, we must suppress all modification hooks.  */
4341   if (replace)
4342     {
4343       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4344       inhibit_modification_hooks = 1;
4345       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4346       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4347     }
4348
4349   /* For converion, we must put the gap before the text in addition to
4350      making the gap larger for efficient decoding.  The required gap
4351      size starts from 2000 which is the magic number used in make_gap.
4352      But, after one batch of conversion, it will be incremented if we
4353      find that it is not enough .  */
4354   require = 2000;
4355
4356   if (GAP_SIZE  < require)
4357     make_gap (require - GAP_SIZE);
4358   move_gap_both (from, from_byte);
4359
4360   inserted = inserted_byte = 0;
4361   src = GAP_END_ADDR, dst = GPT_ADDR;
4362
4363   GAP_SIZE += len_byte;
4364   ZV -= len;
4365   Z -= len;
4366   ZV_BYTE -= len_byte;
4367   Z_BYTE -= len_byte;
4368
4369   if (GPT - BEG < beg_unchanged)
4370     beg_unchanged = GPT - BEG;
4371   if (Z - GPT < end_unchanged)
4372     end_unchanged = Z - GPT;
4373
4374   for (;;)
4375     {
4376       int result;
4377
4378       /* The buffer memory is changed from:
4379          +--------+converted-text+---------+-------original-text------+---+
4380          |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4381                   |<------------------- GAP_SIZE -------------------->|  */
4382       if (encodep)
4383         result = encode_coding (coding, src, dst, len_byte, 0);
4384       else
4385         result = decode_coding (coding, src, dst, len_byte, 0);
4386       /* to:
4387          +--------+-------converted-text--------+--+---original-text--+---+
4388          |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4389                   |<------------------- GAP_SIZE -------------------->|  */
4390       if (coding->fake_multibyte)
4391         fake_multibyte = 1;
4392
4393       if (!encodep && !multibyte)
4394         coding->produced_char = coding->produced;
4395       inserted += coding->produced_char;
4396       inserted_byte += coding->produced;
4397       len_byte -= coding->consumed;
4398       src += coding->consumed;
4399       dst += inserted_byte;
4400
4401       if (result == CODING_FINISH_NORMAL)
4402         {
4403           src += len_byte;
4404           break;
4405         }
4406       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4407         {
4408           unsigned char *pend = dst, *p = pend - inserted_byte;
4409
4410           /* Encode LFs back to the original eol format (CR or CRLF).  */
4411           if (coding->eol_type == CODING_EOL_CR)
4412             {
4413               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4414             }
4415           else
4416             {
4417               int count = 0;
4418
4419               while (p < pend) if (*p++ == '\n') count++;
4420               if (src - dst < count)
4421                 {
4422                   /* We don't have sufficient room for putting LFs
4423                      back to CRLF.  We must record converted and
4424                      not-yet-converted text back to the buffer
4425                      content, enlarge the gap, then record them out of
4426                      the buffer contents again.  */
4427                   int add = len_byte + inserted_byte;
4428
4429                   GAP_SIZE -= add;
4430                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4431                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4432                   make_gap (count - GAP_SIZE);
4433                   GAP_SIZE += add;
4434                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4435                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4436                   /* Don't forget to update SRC, DST, and PEND.  */
4437                   src = GAP_END_ADDR - len_byte;
4438                   dst = GPT_ADDR + inserted_byte;
4439                   pend = dst;
4440                 }
4441               inserted += count;
4442               inserted_byte += count;
4443               coding->produced += count;
4444               p = dst = pend + count;
4445               while (count)
4446                 {
4447                   *--p = *--pend;
4448                   if (*p == '\n') count--, *--p = '\r';
4449                 }
4450             }
4451
4452           /* Suppress eol-format conversion in the further conversion.  */
4453           coding->eol_type = CODING_EOL_LF;
4454
4455           /* Restore the original symbol.  */
4456           coding->symbol = saved_coding_symbol;
4457
4458           continue;
4459         }
4460       if (len_byte <= 0)
4461         {
4462           if (coding->type != coding_type_ccl
4463               || coding->mode & CODING_MODE_LAST_BLOCK)
4464             break;
4465           coding->mode |= CODING_MODE_LAST_BLOCK;
4466           continue;
4467         }
4468       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4469         {
4470           /* The source text ends in invalid codes.  Let's just
4471              make them valid buffer contents, and finish conversion.  */
4472           inserted += len_byte;
4473           inserted_byte += len_byte;
4474           while (len_byte--)
4475             *dst++ = *src++;
4476           fake_multibyte = 1;
4477           break;
4478         }
4479       if (result == CODING_FINISH_INTERRUPT)
4480         {
4481           /* The conversion procedure was interrupted by a user.  */
4482           fake_multibyte = 1;
4483           break;
4484         }
4485       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
4486       if (coding->consumed < 1)
4487         {
4488           /* It's quite strange to require more memory without
4489              consuming any bytes.  Perhaps CCL program bug.  */
4490           fake_multibyte = 1;
4491           break;
4492         }
4493       if (first)
4494         {
4495           /* We have just done the first batch of conversion which was
4496              stoped because of insufficient gap.  Let's reconsider the
4497              required gap size (i.e. SRT - DST) now.
4498
4499              We have converted ORIG bytes (== coding->consumed) into
4500              NEW bytes (coding->produced).  To convert the remaining
4501              LEN bytes, we may need REQUIRE bytes of gap, where:
4502                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4503                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4504              Here, we are sure that NEW >= ORIG.  */
4505           float ratio = coding->produced - coding->consumed;
4506           ratio /= coding->consumed;
4507           require = len_byte * ratio;
4508           first = 0;
4509         }
4510       if ((src - dst) < (require + 2000))
4511         {
4512           /* See the comment above the previous call of make_gap.  */
4513           int add = len_byte + inserted_byte;
4514
4515           GAP_SIZE -= add;
4516           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4517           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4518           make_gap (require + 2000);
4519           GAP_SIZE += add;
4520           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4521           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4522           /* Don't forget to update SRC, DST.  */
4523           src = GAP_END_ADDR - len_byte;
4524           dst = GPT_ADDR + inserted_byte;
4525         }
4526     }
4527   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
4528
4529   if (multibyte
4530       && (encodep
4531           || fake_multibyte
4532           || (to - from) != (to_byte - from_byte)))
4533     inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4534
4535   /* If we have shrinked the conversion area, adjust it now.  */
4536   if (total_skip > 0)
4537     {
4538       if (tail_skip > 0)
4539         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4540       inserted += total_skip; inserted_byte += total_skip;
4541       GAP_SIZE += total_skip;
4542       GPT -= head_skip; GPT_BYTE -= head_skip;
4543       ZV -= total_skip; ZV_BYTE -= total_skip;
4544       Z -= total_skip; Z_BYTE -= total_skip;
4545       from -= head_skip; from_byte -= head_skip;
4546       to += tail_skip; to_byte += tail_skip;
4547     }
4548
4549   prev_Z = Z;
4550   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4551   inserted = Z - prev_Z;
4552
4553   if (! encodep && ! NILP (coding->post_read_conversion))
4554     {
4555       Lisp_Object val;
4556
4557       if (from != PT)
4558         TEMP_SET_PT_BOTH (from, from_byte);
4559       prev_Z = Z;
4560       val = call1 (coding->post_read_conversion, make_number (inserted));
4561       CHECK_NUMBER (val, 0);
4562       inserted += Z - prev_Z;
4563     }
4564
4565   if (orig_point >= from)
4566     {
4567       if (orig_point >= from + orig_len)
4568         orig_point += inserted - orig_len;
4569       else
4570         orig_point = from;
4571       TEMP_SET_PT (orig_point);
4572     }
4573
4574   signal_after_change (from, to - from, inserted);
4575
4576   {
4577     coding->consumed = to_byte - from_byte;
4578     coding->consumed_char = to - from;
4579     coding->produced = inserted_byte;
4580     coding->produced_char = inserted;
4581   }
4582
4583   return 0;
4584 }
4585
/* Encode (if ENCODEP is nonzero) or decode (if ENCODEP is zero) the
   Lisp string STR according to CODING, and return the resulting
   string.

   If CODING has a pre-write-conversion function (when encoding) or a
   post-read-conversion function (when decoding), the text is copied
   into a temporary work buffer and converted there with
   code_convert_region.  Otherwise the conversion is done directly
   into the buffer obtained from get_conversion_buffer.

   If nothing in STR needs converting (and CODING is not a CCL coding
   system), return STR itself when NOCOPY is nonzero, or a copy of it
   made by Fcopy_sequence otherwise.  */
Lisp_Object
code_convert_string (str, coding, encodep, nocopy)
     Lisp_Object str;
     struct coding_system *coding;
     int encodep, nocopy;
{
  int len;
  char *buf;
  /* FROM and TO_BYTE delimit, in bytes, the part of STR that is
     actually converted; TO is the length of STR in characters.  */
  int from = 0, to = XSTRING (str)->size;
  int to_byte = STRING_BYTES (XSTRING (str));
  struct gcpro gcpro1;
  Lisp_Object saved_coding_symbol;
  int result;

  saved_coding_symbol = Qnil;
  if (encodep && !NILP (coding->pre_write_conversion)
      || !encodep && !NILP (coding->post_read_conversion))
    {
      /* Since we have to call Lisp functions which assume target text
         is in a buffer, after setting a temporary buffer, call
         code_convert_region.  */
      int count = specpdl_ptr - specpdl;
      struct buffer *prev = current_buffer;

      /* Make sure the original buffer is restored on non-local exit.  */
      record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
      temp_output_buffer_setup (" *code-converting-work*");
      set_buffer_internal (XBUFFER (Vstandard_output));
      if (encodep)
        insert_from_string (str, 0, 0, to, to_byte, 0);
      else
        {
          /* We must insert the contents of STR as is without
             unibyte<->multibyte conversion.  */
          current_buffer->enable_multibyte_characters = Qnil;
          insert_from_string (str, 0, 0, to_byte, to_byte, 0);
          current_buffer->enable_multibyte_characters = Qt;
        }
      code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
      if (encodep)
        /* We must return the buffer contents as unibyte string.  */
        current_buffer->enable_multibyte_characters = Qnil;
      str = make_buffer_string (BEGV, ZV, 0);
      set_buffer_internal (prev);
      return unbind_to (count, str);
    }

  if (! encodep && CODING_REQUIRE_DETECTION (coding))
    {
      /* See the comments in code_convert_region.  */
      if (coding->type == coding_type_undecided)
        {
          detect_coding (coding, XSTRING (str)->data, to_byte);
          /* If detection did not settle on a type, fall back to
             emacs-mule.  */
          if (coding->type == coding_type_undecided)
            coding->type = coding_type_emacs_mule;
        }
      if (coding->eol_type == CODING_EOL_UNDECIDED)
        {
          /* Remember the symbol so that it can be restored if the
             decoding below has to be retried.  */
          saved_coding_symbol = coding->symbol;
          detect_eol (coding, XSTRING (str)->data, to_byte);
          if (coding->eol_type == CODING_EOL_UNDECIDED)
            coding->eol_type = CODING_EOL_LF;
          /* We had better recover the original eol format if we
             encounter an inconsistent eol format while decoding.  */
          coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
        }
    }

  if (encodep
      ? ! CODING_REQUIRE_ENCODING (coding)
      : ! CODING_REQUIRE_DECODING (coding))
    /* No conversion is required; make the region to convert empty.  */
    from = to_byte;
  else
    {
      /* Try to skip the heading and tailing ASCIIs.  */
      SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
                                encodep);
    }
  /* Nothing is left to convert.  CCL coding systems are excluded
     from this shortcut and always go through the converter.  */
  if (from == to_byte
      && coding->type != coding_type_ccl)
    return (nocopy ? str : Fcopy_sequence (str));

  if (encodep)
    len = encoding_buffer_size (coding, to_byte - from);
  else
    len = decoding_buffer_size (coding, to_byte - from);
  /* Add room for the skipped head and tail, which are copied as is.  */
  len += from + STRING_BYTES (XSTRING (str)) - to_byte;
  GCPRO1 (str);
  buf = get_conversion_buffer (len);
  UNGCPRO;

  /* Copy the skipped head verbatim.  */
  if (from > 0)
    bcopy (XSTRING (str)->data, buf, from);
  result = (encodep
            ? encode_coding (coding, XSTRING (str)->data + from,
                             buf + from, to_byte - from, len)
            : decode_coding (coding, XSTRING (str)->data + from,
                             buf + from, to_byte - from, len));
  if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
    {
      /* We simply try to decode the whole string again but without
         eol-conversion this time.  */
      coding->eol_type = CODING_EOL_LF;
      coding->symbol = saved_coding_symbol;
      return code_convert_string (str, coding, encodep, nocopy);
    }

  /* Copy the skipped tail verbatim, right after the converted text.  */
  bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
         STRING_BYTES (XSTRING (str)) - to_byte);

  /* From here on LEN is the number of bytes copied without
     conversion (head plus tail).  */
  len = from + STRING_BYTES (XSTRING (str)) - to_byte;
  if (encodep)
    str = make_unibyte_string (buf, len + coding->produced);
  else
    {
      /* When the decoder flagged its output as fake multibyte,
         recount the characters instead of trusting produced_char.  */
      int chars= (coding->fake_multibyte
                  ? multibyte_chars_in_text (buf + from, coding->produced)
                  : coding->produced_char);
      str = make_multibyte_string (buf, len + chars, len + coding->produced);
    }

  return str;
}
4708
4709 \f
4710 #ifdef emacs
4711 /*** 8. Emacs Lisp library functions ***/
4712
4713 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4714   "Return t if OBJECT is nil or a coding-system.\n\
4715 See the documentation of `make-coding-system' for information\n\
4716 about coding-system objects.")
4717   (obj)
4718      Lisp_Object obj;
4719 {
4720   if (NILP (obj))
4721     return Qt;
4722   if (!SYMBOLP (obj))
4723     return Qnil;
4724   /* Get coding-spec vector for OBJ.  */
4725   obj = Fget (obj, Qcoding_system);
4726   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4727           ? Qt : Qnil);
4728 }
4729
4730 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4731        Sread_non_nil_coding_system, 1, 1, 0,
4732   "Read a coding system from the minibuffer, prompting with string PROMPT.")
4733   (prompt)
4734      Lisp_Object prompt;
4735 {
4736   Lisp_Object val;
4737   do
4738     {
4739       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4740                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4741     }
4742   while (XSTRING (val)->size == 0);
4743   return (Fintern (val, Qnil));
4744 }
4745
4746 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4747   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4748 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4749   (prompt, default_coding_system)
4750      Lisp_Object prompt, default_coding_system;
4751 {
4752   Lisp_Object val;
4753   if (SYMBOLP (default_coding_system))
4754     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4755   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4756                           Qt, Qnil, Qcoding_system_history,
4757                           default_coding_system, Qnil);
4758   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4759 }
4760
4761 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4762        1, 1, 0,
4763   "Check validity of CODING-SYSTEM.\n\
4764 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4765 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4766 The value of property should be a vector of length 5.")
4767   (coding_system)
4768      Lisp_Object coding_system;
4769 {
4770   CHECK_SYMBOL (coding_system, 0);
4771   if (!NILP (Fcoding_system_p (coding_system)))
4772     return coding_system;
4773   while (1)
4774     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4775 }
4776 \f
4777 Lisp_Object
4778 detect_coding_system (src, src_bytes, highest)
4779      unsigned char *src;
4780      int src_bytes, highest;
4781 {
4782   int coding_mask, eol_type;
4783   Lisp_Object val, tmp;
4784   int dummy;
4785
4786   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4787   eol_type  = detect_eol_type (src, src_bytes, &dummy);
4788   if (eol_type == CODING_EOL_INCONSISTENT)
4789     eol_type = CODING_EOL_UNDECIDED;
4790
4791   if (!coding_mask)
4792     {
4793       val = Qundecided;
4794       if (eol_type != CODING_EOL_UNDECIDED)
4795         {
4796           Lisp_Object val2;
4797           val2 = Fget (Qundecided, Qeol_type);
4798           if (VECTORP (val2))
4799             val = XVECTOR (val2)->contents[eol_type];
4800         }
4801       return (highest ? val : Fcons (val, Qnil));
4802     }
4803
4804   /* At first, gather possible coding systems in VAL.  */
4805   val = Qnil;
4806   for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4807     {
4808       int idx
4809         = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4810       if (coding_mask & (1 << idx))
4811         {
4812           val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4813           if (highest)
4814             break;
4815         }
4816     }
4817   if (!highest)
4818     val = Fnreverse (val);
4819
4820   /* Then, replace the elements with subsidiary coding systems.  */
4821   for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4822     {
4823       if (eol_type != CODING_EOL_UNDECIDED
4824           && eol_type != CODING_EOL_INCONSISTENT)
4825         {
4826           Lisp_Object eol;
4827           eol = Fget (XCONS (tmp)->car, Qeol_type);
4828           if (VECTORP (eol))
4829             XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4830         }
4831     }
4832   return (highest ? XCONS (val)->car : val);
4833 }
4834
4835 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4836        2, 3, 0,
4837   "Detect coding system of the text in the region between START and END.\n\
4838 Return a list of possible coding systems ordered by priority.\n\
4839 \n\
4840 If only ASCII characters are found, it returns a list of single element\n\
4841 `undecided' or its subsidiary coding system according to a detected\n\
4842 end-of-line format.\n\
4843 \n\
4844 If optional argument HIGHEST is non-nil, return the coding system of\n\
4845 highest priority.")
4846   (start, end, highest)
4847      Lisp_Object start, end, highest;
4848 {
4849   int from, to;
4850   int from_byte, to_byte;
4851
4852   CHECK_NUMBER_COERCE_MARKER (start, 0);
4853   CHECK_NUMBER_COERCE_MARKER (end, 1);
4854
4855   validate_region (&start, &end);
4856   from = XINT (start), to = XINT (end);
4857   from_byte = CHAR_TO_BYTE (from);
4858   to_byte = CHAR_TO_BYTE (to);
4859
4860   if (from < GPT && to >= GPT)
4861     move_gap_both (to, to_byte);
4862
4863   return detect_coding_system (BYTE_POS_ADDR (from_byte),
4864                                to_byte - from_byte,
4865                                !NILP (highest));
4866 }
4867
4868 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4869        1, 2, 0,
4870   "Detect coding system of the text in STRING.\n\
4871 Return a list of possible coding systems ordered by priority.\n\
4872 \n\
4873 If only ASCII characters are found, it returns a list of single element\n\
4874 `undecided' or its subsidiary coding system according to a detected\n\
4875 end-of-line format.\n\
4876 \n\
4877 If optional argument HIGHEST is non-nil, return the coding system of\n\
4878 highest priority.")
4879   (string, highest)
4880      Lisp_Object string, highest;
4881 {
4882   CHECK_STRING (string, 0);
4883
4884   return detect_coding_system (XSTRING (string)->data,
4885                                STRING_BYTES (XSTRING (string)),
4886                                !NILP (highest));
4887 }
4888
4889 Lisp_Object
4890 code_convert_region1 (start, end, coding_system, encodep)
4891      Lisp_Object start, end, coding_system;
4892      int encodep;
4893 {
4894   struct coding_system coding;
4895   int from, to, len;
4896
4897   CHECK_NUMBER_COERCE_MARKER (start, 0);
4898   CHECK_NUMBER_COERCE_MARKER (end, 1);
4899   CHECK_SYMBOL (coding_system, 2);
4900
4901   validate_region (&start, &end);
4902   from = XFASTINT (start);
4903   to = XFASTINT (end);
4904
4905   if (NILP (coding_system))
4906     return make_number (to - from);
4907
4908   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4909     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4910
4911   coding.mode |= CODING_MODE_LAST_BLOCK;
4912   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4913                        &coding, encodep, 1);
4914   Vlast_coding_system_used = coding.symbol;
4915   return make_number (coding.produced_char);
4916 }
4917
4918 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4919        3, 3, "r\nzCoding system: ",
4920   "Decode the current region by specified coding system.\n\
4921 When called from a program, takes three arguments:\n\
4922 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
4923 This function sets `last-coding-system-used' to the precise coding system\n\
4924 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4925 not fully specified.)\n\
4926 It returns the length of the decoded text.")
4927   (start, end, coding_system)
4928      Lisp_Object start, end, coding_system;
4929 {
4930   return code_convert_region1 (start, end, coding_system, 0);
4931 }
4932
4933 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4934        3, 3, "r\nzCoding system: ",
4935   "Encode the current region by specified coding system.\n\
4936 When called from a program, takes three arguments:\n\
4937 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
4938 This function sets `last-coding-system-used' to the precise coding system\n\
4939 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4940 not fully specified.)\n\
4941 It returns the length of the encoded text.")
4942   (start, end, coding_system)
4943      Lisp_Object start, end, coding_system;
4944 {
4945   return code_convert_region1 (start, end, coding_system, 1);
4946 }
4947
4948 Lisp_Object
4949 code_convert_string1 (string, coding_system, nocopy, encodep)
4950      Lisp_Object string, coding_system, nocopy;
4951      int encodep;
4952 {
4953   struct coding_system coding;
4954
4955   CHECK_STRING (string, 0);
4956   CHECK_SYMBOL (coding_system, 1);
4957
4958   if (NILP (coding_system))
4959     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4960
4961   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4962     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4963
4964   coding.mode |= CODING_MODE_LAST_BLOCK;
4965   Vlast_coding_system_used = coding.symbol;
4966   return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4967 }
4968
4969 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
4970        2, 3, 0,
4971   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4972 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4973 if the decoding operation is trivial.\n\
4974 This function sets `last-coding-system-used' to the precise coding system\n\
4975 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4976 not fully specified.)")
4977   (string, coding_system, nocopy)
4978      Lisp_Object string, coding_system, nocopy;
4979 {
4980   return code_convert_string1 (string, coding_system, nocopy, 0);
4981 }
4982
4983 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
4984        2, 3, 0,
4985   "Encode STRING to CODING-SYSTEM, and return the result.\n\
4986 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4987 if the encoding operation is trivial.\n\
4988 This function sets `last-coding-system-used' to the precise coding system\n\
4989 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4990 not fully specified.)")
4991   (string, coding_system, nocopy)
4992      Lisp_Object string, coding_system, nocopy;
4993 {
4994   return code_convert_string1 (string, coding_system, nocopy, 1);
4995 }
4996
4997 /* Encode or decode STRING according to CODING_SYSTEM.
4998    Do not set Vlast_coding_system_used.  */
4999
5000 Lisp_Object
5001 code_convert_string_norecord (string, coding_system, encodep)
5002      Lisp_Object string, coding_system;
5003      int encodep;
5004 {
5005   struct coding_system coding;
5006
5007   CHECK_STRING (string, 0);
5008   CHECK_SYMBOL (coding_system, 1);
5009
5010   if (NILP (coding_system))
5011     return string;
5012
5013   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5014     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5015
5016   coding.mode |= CODING_MODE_LAST_BLOCK;
5017   return code_convert_string (string, &coding, encodep, Qt);
5018 }
5019 \f
5020 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5021   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5022 Return the corresponding character.")
5023   (code)
5024      Lisp_Object code;
5025 {
5026   unsigned char c1, c2, s1, s2;
5027   Lisp_Object val;
5028
5029   CHECK_NUMBER (code, 0);
5030   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5031   if (s1 == 0)
5032     {
5033       if (s2 < 0xA0 || s2 > 0xDF)
5034         error ("Invalid Shift JIS code: %s", XFASTINT (code));
5035       XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5036     }
5037   else
5038     {
5039       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5040           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5041         error ("Invalid Shift JIS code: %s", XFASTINT (code));
5042       DECODE_SJIS (s1, s2, c1, c2);
5043       XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5044     }
5045   return val;
5046 }
5047
5048 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5049   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5050 Return the corresponding code in SJIS.")
5051   (ch)
5052      Lisp_Object ch;
5053 {
5054   int charset, c1, c2, s1, s2;
5055   Lisp_Object val;
5056
5057   CHECK_NUMBER (ch, 0);
5058   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5059   if (charset == charset_jisx0208
5060       && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5061     {
5062       ENCODE_SJIS (c1, c2, s1, s2);
5063       XSETFASTINT (val, (s1 << 8) | s2);
5064     }
5065   else if (charset == charset_katakana_jisx0201
5066            && c1 > 0x20 && c2 < 0xE0)
5067     {
5068       XSETFASTINT (val, c1 | 0x80);
5069     }
5070   else
5071     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5072
5073   return val;
5074 }
5075
5076 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5077   "Decode a Big5 character CODE of BIG5 coding system.\n\
5078 CODE is the character code in BIG5.\n\
5079 Return the corresponding character.")
5080   (code)
5081      Lisp_Object code;
5082 {
5083   int charset;
5084   unsigned char b1, b2, c1, c2;
5085   Lisp_Object val;
5086
5087   CHECK_NUMBER (code, 0);
5088   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5089   DECODE_BIG5 (b1, b2, charset, c1, c2);
5090   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5091   return val;
5092 }
5093
5094 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5095   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5096 Return the corresponding character code in Big5.")
5097   (ch)
5098      Lisp_Object ch;
5099 {
5100   int charset, c1, c2, b1, b2;
5101   Lisp_Object val;
5102
5103   CHECK_NUMBER (ch, 0);
5104   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5105   if (charset == charset_big5_1 || charset == charset_big5_2)
5106     {
5107       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5108       XSETFASTINT (val, (b1 << 8) | b2);
5109     }
5110   else
5111     XSETFASTINT (val, 0);
5112   return val;
5113 }
5114 \f
5115 DEFUN ("set-terminal-coding-system-internal",
5116        Fset_terminal_coding_system_internal,
5117        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5118   (coding_system)
5119      Lisp_Object coding_system;
5120 {
5121   CHECK_SYMBOL (coding_system, 0);
5122   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5123   /* We had better not send unsafe characters to terminal.  */
5124   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5125
5126   return Qnil;
5127 }
5128
5129 DEFUN ("set-safe-terminal-coding-system-internal",
5130        Fset_safe_terminal_coding_system_internal,
5131        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5132   (coding_system)
5133      Lisp_Object coding_system;
5134 {
5135   CHECK_SYMBOL (coding_system, 0);
5136   setup_coding_system (Fcheck_coding_system (coding_system),
5137                        &safe_terminal_coding);
5138   return Qnil;
5139 }
5140
5141 DEFUN ("terminal-coding-system",
5142        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5143   "Return coding system specified for terminal output.")
5144   ()
5145 {
5146   return terminal_coding.symbol;
5147 }
5148
5149 DEFUN ("set-keyboard-coding-system-internal",
5150        Fset_keyboard_coding_system_internal,
5151        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5152   (coding_system)
5153      Lisp_Object coding_system;
5154 {
5155   CHECK_SYMBOL (coding_system, 0);
5156   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5157   return Qnil;
5158 }
5159
5160 DEFUN ("keyboard-coding-system",
5161        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5162   "Return coding system specified for decoding keyboard input.")
5163   ()
5164 {
5165   return keyboard_coding.symbol;
5166 }
5167
5168 \f
5169 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5170        Sfind_operation_coding_system,  1, MANY, 0,
5171   "Choose a coding system for an operation based on the target name.\n\
5172 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5173 DECODING-SYSTEM is the coding system to use for decoding\n\
5174 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5175 for encoding (in case OPERATION does encoding).\n\
5176 \n\
5177 The first argument OPERATION specifies an I/O primitive:\n\
5178   For file I/O, `insert-file-contents' or `write-region'.\n\
5179   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5180   For network I/O, `open-network-stream'.\n\
5181 \n\
5182 The remaining arguments should be the same arguments that were passed\n\
5183 to the primitive.  Depending on which primitive, one of those arguments\n\
5184 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5185 whichever argument specifies the file name is TARGET.\n\
5186 \n\
5187 TARGET has a meaning which depends on OPERATION:\n\
5188   For file I/O, TARGET is a file name.\n\
5189   For process I/O, TARGET is a process name.\n\
5190   For network I/O, TARGET is a service name or a port number\n\
5191 \n\
5192 This function looks up what specified for TARGET in,\n\
5193 `file-coding-system-alist', `process-coding-system-alist',\n\
5194 or `network-coding-system-alist' depending on OPERATION.\n\
5195 They may specify a coding system, a cons of coding systems,\n\
5196 or a function symbol to call.\n\
5197 In the last case, we call the function with one argument,\n\
5198 which is a list of all the arguments given to this function.")
5199   (nargs, args)
5200      int nargs;
5201      Lisp_Object *args;
5202 {
5203   Lisp_Object operation, target_idx, target, val;
5204   register Lisp_Object chain;
5205
5206   if (nargs < 2)
5207     error ("Too few arguments");
5208   operation = args[0];
5209   if (!SYMBOLP (operation)
5210       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5211     error ("Invalid first arguement");
5212   if (nargs < 1 + XINT (target_idx))
5213     error ("Too few arguments for operation: %s",
5214            XSYMBOL (operation)->name->data);
5215   target = args[XINT (target_idx) + 1];
5216   if (!(STRINGP (target)
5217         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5218     error ("Invalid %dth argument", XINT (target_idx) + 1);
5219
5220   chain = ((EQ (operation, Qinsert_file_contents)
5221             || EQ (operation, Qwrite_region))
5222            ? Vfile_coding_system_alist
5223            : (EQ (operation, Qopen_network_stream)
5224               ? Vnetwork_coding_system_alist
5225               : Vprocess_coding_system_alist));
5226   if (NILP (chain))
5227     return Qnil;
5228
5229   for (; CONSP (chain); chain = XCONS (chain)->cdr)
5230     {
5231       Lisp_Object elt;
5232       elt = XCONS (chain)->car;
5233
5234       if (CONSP (elt)
5235           && ((STRINGP (target)
5236                && STRINGP (XCONS (elt)->car)
5237                && fast_string_match (XCONS (elt)->car, target) >= 0)
5238               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
5239         {
5240           val = XCONS (elt)->cdr;
5241           /* Here, if VAL is both a valid coding system and a valid
5242              function symbol, we return VAL as a coding system.  */
5243           if (CONSP (val))
5244             return val;
5245           if (! SYMBOLP (val))
5246             return Qnil;
5247           if (! NILP (Fcoding_system_p (val)))
5248             return Fcons (val, val);
5249           if (! NILP (Ffboundp (val)))
5250             {
5251               val = call1 (val, Flist (nargs, args));
5252               if (CONSP (val))
5253                 return val;
5254               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5255                 return Fcons (val, val);
5256             }
5257           return Qnil;
5258         }
5259     }
5260   return Qnil;
5261 }
5262
5263 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5264        Supdate_coding_systems_internal, 0, 0, 0,
5265   "Update internal database for ISO2022 and CCL based coding systems.\n\
5266 When values of the following coding categories are changed, you must\n\
5267 call this function:\n\
5268   coding-category-iso-7, coding-category-iso-7-tight,\n\
5269   coding-category-iso-8-1, coding-category-iso-8-2,\n\
5270   coding-category-iso-7-else, coding-category-iso-8-else,\n\
5271   coding-category-ccl")
5272   ()
5273 {
5274   int i;
5275
5276   for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
5277     {
5278       Lisp_Object val;
5279
5280       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5281       if (!NILP (val))
5282         {
5283           if (! coding_system_table[i])
5284             coding_system_table[i] = ((struct coding_system *)
5285                                       xmalloc (sizeof (struct coding_system)));
5286           setup_coding_system (val, coding_system_table[i]);
5287         }
5288       else if (coding_system_table[i])
5289         {
5290           xfree (coding_system_table[i]);
5291           coding_system_table[i] = NULL;
5292         }
5293     }
5294
5295   return Qnil;
5296 }
5297
5298 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5299        Sset_coding_priority_internal, 0, 0, 0,
5300   "Update internal database for the current value of `coding-category-list'.\n\
5301 This function is internal use only.")
5302   ()
5303 {
5304   int i = 0, idx;
5305   Lisp_Object val;
5306
5307   val = Vcoding_category_list;
5308
5309   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5310     {
5311       if (! SYMBOLP (XCONS (val)->car))
5312         break;
5313       idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5314       if (idx >= CODING_CATEGORY_IDX_MAX)
5315         break;
5316       coding_priorities[i++] = (1 << idx);
5317       val = XCONS (val)->cdr;
5318     }
5319   /* If coding-category-list is valid and contains all coding
5320      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5321      the following code saves Emacs from craching.  */
5322   while (i < CODING_CATEGORY_IDX_MAX)
5323     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5324
5325   return Qnil;
5326 }
5327
5328 #endif /* emacs */
5329
5330 \f
5331 /*** 9. Post-amble ***/
5332
5333 void
5334 init_coding ()
5335 {
5336   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5337 }
5338
5339 void
5340 init_coding_once ()
5341 {
5342   int i;
5343
5344   /* Emacs' internal format specific initialize routine.  */
5345   for (i = 0; i <= 0x20; i++)
5346     emacs_code_class[i] = EMACS_control_code;
5347   emacs_code_class[0x0A] = EMACS_linefeed_code;
5348   emacs_code_class[0x0D] = EMACS_carriage_return_code;
5349   for (i = 0x21 ; i < 0x7F; i++)
5350     emacs_code_class[i] = EMACS_ascii_code;
5351   emacs_code_class[0x7F] = EMACS_control_code;
5352   emacs_code_class[0x80] = EMACS_leading_code_composition;
5353   for (i = 0x81; i < 0xFF; i++)
5354     emacs_code_class[i] = EMACS_invalid_code;
5355   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5356   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5357   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5358   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5359
5360   /* ISO2022 specific initialize routine.  */
5361   for (i = 0; i < 0x20; i++)
5362     iso_code_class[i] = ISO_control_code;
5363   for (i = 0x21; i < 0x7F; i++)
5364     iso_code_class[i] = ISO_graphic_plane_0;
5365   for (i = 0x80; i < 0xA0; i++)
5366     iso_code_class[i] = ISO_control_code;
5367   for (i = 0xA1; i < 0xFF; i++)
5368     iso_code_class[i] = ISO_graphic_plane_1;
5369   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5370   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5371   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5372   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5373   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5374   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5375   iso_code_class[ISO_CODE_ESC] = ISO_escape;
5376   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5377   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5378   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5379
5380   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5381
5382   setup_coding_system (Qnil, &keyboard_coding);
5383   setup_coding_system (Qnil, &terminal_coding);
5384   setup_coding_system (Qnil, &safe_terminal_coding);
5385   setup_coding_system (Qnil, &default_buffer_file_coding);
5386
5387   bzero (coding_system_table, sizeof coding_system_table);
5388
5389   bzero (ascii_skip_code, sizeof ascii_skip_code);
5390   for (i = 0; i < 128; i++)
5391     ascii_skip_code[i] = 1;
5392
5393 #if defined (MSDOS) || defined (WINDOWSNT)
5394   system_eol_type = CODING_EOL_CRLF;
5395 #else
5396   system_eol_type = CODING_EOL_LF;
5397 #endif
5398 }
5399
5400 #ifdef emacs
5401
5402 void
5403 syms_of_coding ()
5404 {
5405   Qtarget_idx = intern ("target-idx");
5406   staticpro (&Qtarget_idx);
5407
5408   Qcoding_system_history = intern ("coding-system-history");
5409   staticpro (&Qcoding_system_history);
5410   Fset (Qcoding_system_history, Qnil);
5411
5412   /* Target FILENAME is the first argument.  */
5413   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5414   /* Target FILENAME is the third argument.  */
5415   Fput (Qwrite_region, Qtarget_idx, make_number (2));
5416
5417   Qcall_process = intern ("call-process");
5418   staticpro (&Qcall_process);
5419   /* Target PROGRAM is the first argument.  */
5420   Fput (Qcall_process, Qtarget_idx, make_number (0));
5421
5422   Qcall_process_region = intern ("call-process-region");
5423   staticpro (&Qcall_process_region);
5424   /* Target PROGRAM is the third argument.  */
5425   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5426
5427   Qstart_process = intern ("start-process");
5428   staticpro (&Qstart_process);
5429   /* Target PROGRAM is the third argument.  */
5430   Fput (Qstart_process, Qtarget_idx, make_number (2));
5431
5432   Qopen_network_stream = intern ("open-network-stream");
5433   staticpro (&Qopen_network_stream);
5434   /* Target SERVICE is the fourth argument.  */
5435   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5436
5437   Qcoding_system = intern ("coding-system");
5438   staticpro (&Qcoding_system);
5439
5440   Qeol_type = intern ("eol-type");
5441   staticpro (&Qeol_type);
5442
5443   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5444   staticpro (&Qbuffer_file_coding_system);
5445
5446   Qpost_read_conversion = intern ("post-read-conversion");
5447   staticpro (&Qpost_read_conversion);
5448
5449   Qpre_write_conversion = intern ("pre-write-conversion");
5450   staticpro (&Qpre_write_conversion);
5451
5452   Qno_conversion = intern ("no-conversion");
5453   staticpro (&Qno_conversion);
5454
5455   Qundecided = intern ("undecided");
5456   staticpro (&Qundecided);
5457
5458   Qcoding_system_p = intern ("coding-system-p");
5459   staticpro (&Qcoding_system_p);
5460
5461   Qcoding_system_error = intern ("coding-system-error");
5462   staticpro (&Qcoding_system_error);
5463
5464   Fput (Qcoding_system_error, Qerror_conditions,
5465         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5466   Fput (Qcoding_system_error, Qerror_message,
5467         build_string ("Invalid coding system"));
5468
5469   Qcoding_category = intern ("coding-category");
5470   staticpro (&Qcoding_category);
5471   Qcoding_category_index = intern ("coding-category-index");
5472   staticpro (&Qcoding_category_index);
5473
5474   Vcoding_category_table
5475     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5476   staticpro (&Vcoding_category_table);
5477   {
5478     int i;
5479     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5480       {
5481         XVECTOR (Vcoding_category_table)->contents[i]
5482           = intern (coding_category_name[i]);
5483         Fput (XVECTOR (Vcoding_category_table)->contents[i],
5484               Qcoding_category_index, make_number (i));
5485       }
5486   }
5487
5488   Qtranslation_table = intern ("translation-table");
5489   staticpro (&Qtranslation_table);
5490   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
5491
5492   Qtranslation_table_id = intern ("translation-table-id");
5493   staticpro (&Qtranslation_table_id);
5494
5495   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5496   staticpro (&Qtranslation_table_for_decode);
5497
5498   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5499   staticpro (&Qtranslation_table_for_encode);
5500
5501   Qsafe_charsets = intern ("safe-charsets");
5502   staticpro (&Qsafe_charsets);
5503
5504   Qvalid_codes = intern ("valid-codes");
5505   staticpro (&Qvalid_codes);
5506
5507   Qemacs_mule = intern ("emacs-mule");
5508   staticpro (&Qemacs_mule);
5509
5510   Qraw_text = intern ("raw-text");
5511   staticpro (&Qraw_text);
5512
5513   defsubr (&Scoding_system_p);
5514   defsubr (&Sread_coding_system);
5515   defsubr (&Sread_non_nil_coding_system);
5516   defsubr (&Scheck_coding_system);
5517   defsubr (&Sdetect_coding_region);
5518   defsubr (&Sdetect_coding_string);
5519   defsubr (&Sdecode_coding_region);
5520   defsubr (&Sencode_coding_region);
5521   defsubr (&Sdecode_coding_string);
5522   defsubr (&Sencode_coding_string);
5523   defsubr (&Sdecode_sjis_char);
5524   defsubr (&Sencode_sjis_char);
5525   defsubr (&Sdecode_big5_char);
5526   defsubr (&Sencode_big5_char);
5527   defsubr (&Sset_terminal_coding_system_internal);
5528   defsubr (&Sset_safe_terminal_coding_system_internal);
5529   defsubr (&Sterminal_coding_system);
5530   defsubr (&Sset_keyboard_coding_system_internal);
5531   defsubr (&Skeyboard_coding_system);
5532   defsubr (&Sfind_operation_coding_system);
5533   defsubr (&Supdate_coding_systems_internal);
5534   defsubr (&Sset_coding_priority_internal);
5535
5536   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5537     "List of coding systems.\n\
5538 \n\
5539 Do not alter the value of this variable manually.  This variable should be\n\
5540 updated by the functions `make-coding-system' and\n\
5541 `define-coding-system-alias'.");
5542   Vcoding_system_list = Qnil;
5543
5544   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5545     "Alist of coding system names.\n\
5546 Each element is one element list of coding system name.\n\
5547 This variable is given to `completing-read' as TABLE argument.\n\
5548 \n\
5549 Do not alter the value of this variable manually.  This variable should be\n\
5550 updated by the functions `make-coding-system' and\n\
5551 `define-coding-system-alias'.");
5552   Vcoding_system_alist = Qnil;
5553
5554   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5555     "List of coding-categories (symbols) ordered by priority.");
5556   {
5557     int i;
5558
5559     Vcoding_category_list = Qnil;
5560     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5561       Vcoding_category_list
5562         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5563                  Vcoding_category_list);
5564   }
5565
5566   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
5567     "Specify the coding system for read operations.\n\
5568 It is useful to bind this variable with `let', but do not set it globally.\n\
5569 If the value is a coding system, it is used for decoding on read operation.\n\
5570 If not, an appropriate element is used from one of the coding system alists:\n\
5571 There are three such tables, `file-coding-system-alist',\n\
5572 `process-coding-system-alist', and `network-coding-system-alist'.");
5573   Vcoding_system_for_read = Qnil;
5574
5575   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
5576     "Specify the coding system for write operations.\n\
5577 It is useful to bind this variable with `let', but do not set it globally.\n\
5578 If the value is a coding system, it is used for encoding on write operation.\n\
5579 If not, an appropriate element is used from one of the coding system alists:\n\
5580 There are three such tables, `file-coding-system-alist',\n\
5581 `process-coding-system-alist', and `network-coding-system-alist'.");
5582   Vcoding_system_for_write = Qnil;
5583
5584   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
5585     "Coding system used in the latest file or process I/O.");
5586   Vlast_coding_system_used = Qnil;
5587
5588   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5589     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
5590 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5591 such conversion.");
5592   inhibit_eol_conversion = 0;
5593
5594   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5595     "Non-nil means process buffer inherits coding system of process output.\n\
5596 Bind it to t if the process output is to be treated as if it were a file\n\
5597 read from some filesystem.");
5598   inherit_process_coding_system = 0;
5599
5600   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5601     "Alist to decide a coding system to use for a file I/O operation.\n\
5602 The format is ((PATTERN . VAL) ...),\n\
5603 where PATTERN is a regular expression matching a file name,\n\
5604 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5605 If VAL is a coding system, it is used for both decoding and encoding\n\
5606 the file contents.\n\
5607 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5608 and the cdr part is used for encoding.\n\
5609 If VAL is a function symbol, the function must return a coding system\n\
5610 or a cons of coding systems which are used as above.\n\
5611 \n\
5612 See also the function `find-operation-coding-system'\n\
5613 and the variable `auto-coding-alist'.");
5614   Vfile_coding_system_alist = Qnil;
5615
5616   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5617     "Alist to decide a coding system to use for a process I/O operation.\n\
5618 The format is ((PATTERN . VAL) ...),\n\
5619 where PATTERN is a regular expression matching a program name,\n\
5620 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5621 If VAL is a coding system, it is used for both decoding what received\n\
5622 from the program and encoding what sent to the program.\n\
5623 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5624 and the cdr part is used for encoding.\n\
5625 If VAL is a function symbol, the function must return a coding system\n\
5626 or a cons of coding systems which are used as above.\n\
5627 \n\
5628 See also the function `find-operation-coding-system'.");
5629   Vprocess_coding_system_alist = Qnil;
5630
5631   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5632     "Alist to decide a coding system to use for a network I/O operation.\n\
5633 The format is ((PATTERN . VAL) ...),\n\
5634 where PATTERN is a regular expression matching a network service name\n\
5635 or is a port number to connect to,\n\
5636 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5637 If VAL is a coding system, it is used for both decoding what received\n\
5638 from the network stream and encoding what sent to the network stream.\n\
5639 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5640 and the cdr part is used for encoding.\n\
5641 If VAL is a function symbol, the function must return a coding system\n\
5642 or a cons of coding systems which are used as above.\n\
5643 \n\
5644 See also the function `find-operation-coding-system'.");
5645   Vnetwork_coding_system_alist = Qnil;
5646
5647   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5648     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5649   eol_mnemonic_unix = build_string (":");
5650
5651   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5652     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5653   eol_mnemonic_dos = build_string ("\\");
5654
5655   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5656     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5657   eol_mnemonic_mac = build_string ("/");
5658
5659   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5660     "*String displayed in mode line when end-of-line format is not yet determined.");
5661   eol_mnemonic_undecided = build_string (":");
5662
5663   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
5664     "*Non-nil enables character translation while encoding and decoding.");
5665   Venable_character_translation = Qt;
5666
5667   DEFVAR_LISP ("standard-translation-table-for-decode",
5668     &Vstandard_translation_table_for_decode,
5669     "Table for translating characters while decoding.");
5670   Vstandard_translation_table_for_decode = Qnil;
5671
5672   DEFVAR_LISP ("standard-translation-table-for-encode",
5673     &Vstandard_translation_table_for_encode,
5674     "Table for translationg characters while encoding.");
5675   Vstandard_translation_table_for_encode = Qnil;
5676
5677   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5678     "Alist of charsets vs revision numbers.\n\
5679 While encoding, if a charset (car part of an element) is found,\n\
5680 designate it with the escape sequence identifing revision (cdr part of the element).");
5681   Vcharset_revision_alist = Qnil;
5682
5683   DEFVAR_LISP ("default-process-coding-system",
5684                &Vdefault_process_coding_system,
5685     "Cons of coding systems used for process I/O by default.\n\
5686 The car part is used for decoding a process output,\n\
5687 the cdr part is used for encoding a text to be sent to a process.");
5688   Vdefault_process_coding_system = Qnil;
5689
5690   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5691     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5692 This is a vector of length 256.\n\
5693 If Nth element is non-nil, the existence of code N in a file\n\
5694 \(or output of subprocess) doesn't prevent it to be detected as\n\
5695 a coding system of ISO 2022 variant which has a flag\n\
5696 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5697 or reading output of a subprocess.\n\
5698 Only 128th through 159th elements has a meaning.");
5699   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5700
5701   DEFVAR_LISP ("select-safe-coding-system-function",
5702                &Vselect_safe_coding_system_function,
5703     "Function to call to select safe coding system for encoding a text.\n\
5704 \n\
5705 If set, this function is called to force a user to select a proper\n\
5706 coding system which can encode the text in the case that a default\n\
5707 coding system used in each operation can't encode the text.\n\
5708 \n\
5709 The default value is `select-safe-coding-system' (which see).");
5710   Vselect_safe_coding_system_function = Qnil;
5711
5712 }
5713
5714 #endif /* emacs */