src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. CCL handlers
  29   6. End-of-line handlers
  30   7. C library functions
  31   8. Emacs Lisp library functions
  32   9. Post-amble
  33
  34 */
  35
  36 /*** GENERAL NOTE on CODING SYSTEM ***
  37
  38   Coding system is an encoding mechanism of one or more character
  39   sets.  Here's a list of coding systems which Emacs can handle.  When
  40   we say "decode", it means converting some other coding system to
  41   Emacs' internal format (emacs-mule), and when we say "encode",
  42   it means converting the coding system emacs-mule to some other
  43   coding system.
  44
  45   0. Emacs' internal format (emacs-mule)
  46
  47   Emacs itself holds a multi-lingual character in a buffer and a string
  48   in a special format.  Details are described in section 2.
  49
  50   1. ISO2022
  51
  52   The most famous coding system for multiple character sets.  X's
  53   Compound Text, various EUCs (Extended Unix Code), and coding
  54   systems used in Internet communication such as ISO-2022-JP are
  55   all variants of ISO2022.  Details are described in section 3.
  56
  57   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  58
  59   A coding system to encode character sets: ASCII, JISX0201, and
  60   JISX0208.  Widely used for PC's in Japan.  Details are described in
  61   section 4.
  62
  63   3. BIG5
  64
  65   A coding system to encode character sets: ASCII and Big5.  Widely
  66   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  67   described in section 4.  In this file, when we write "BIG5"
  68   (all uppercase), we mean the coding system, and when we write
  69   "Big5" (capitalized), we mean the character set.
  70
  71   4. Raw text
  72
  73   A coding system for a text containing random 8-bit code.  Emacs does
  74   no code conversion on such a text except for end-of-line format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How end-of-line of a text is encoded depends on a system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is usually one byte of
  96   `carriage-return'.
  97
  98   Since text characters encoding and end-of-line encoding are
  99   independent, any coding system described above can take
 100   any format of end-of-line.  So, Emacs has information of format of
 101   end-of-line in each coding-system.  See section 6 for more details.
 102
 103 */
 104
 105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 106
 107   These functions check if a text between SRC and SRC_END is encoded
 108   in the coding system category XXX.  Each returns an integer value in
 109   which appropriate flag bits for the category XXX are set.  The flag
 110   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 111   template of these functions.  */
 112 #if 0
 113 int
 114 detect_coding_emacs_mule (src, src_end)
 115      unsigned char *src, *src_end;
 116 {
 117   ...
 118 }
 119 #endif
 120
 121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 122
 123   These functions decode SRC_BYTES length text at SOURCE encoded in
 124   CODING to Emacs' internal format (emacs-mule).  The resulting text
 125   goes to a place pointed to by DESTINATION, the length of which
 126   should not exceed DST_BYTES.  These functions set the information of
 127   original and decoded texts in the members produced, produced_char,
 128   consumed, and consumed_char of the structure *CODING.
 129
 130   The return value is an integer (CODING_FINISH_XXX) indicating how
 131   the decoding finished.
 132
 133   DST_BYTES zero means that source area and destination area are
 134   overlapped, which means that we can produce a decoded text until it
 135   reaches the head of the not-yet-decoded source text.
 136
 137   Below is a template of these functions.  */
 138 #if 0
 139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 140      struct coding_system *coding;
 141      unsigned char *source, *destination;
 142      int src_bytes, dst_bytes;
 143 {
 144   ...
 145 }
 146 #endif
 147
 148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 149
 150   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 151   internal format (emacs-mule) to CODING.  The resulting text goes to
 152   a place pointed to by DESTINATION, the length of which should not
 153   exceed DST_BYTES.  These functions set the information of
 154   original and encoded texts in the members produced, produced_char,
 155   consumed, and consumed_char of the structure *CODING.
 156
 157   The return value is an integer (CODING_FINISH_XXX) indicating how
 158   the encoding finished.
 159
 160   DST_BYTES zero means that source area and destination area are
 161   overlapped, which means that we can produce an encoded text until it
 162   reaches the head of the not-yet-encoded source text.
 163
 164   Below is a template of these functions.  */
 165 #if 0
 166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 167      struct coding_system *coding;
 168      unsigned char *source, *destination;
 169      int src_bytes, dst_bytes;
 170 {
 171   ...
 172 }
 173 #endif
 174
 175 /*** COMMONLY USED MACROS ***/
 176
 177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 178    THREE_MORE_BYTES safely get one, two, and three bytes from the
 179    source text respectively.  If there are not enough bytes in the
 180    source, they jump to `label_end_of_loop'.  The caller should set
 181    variables `src' and `src_end' to appropriate areas in advance.

        Each macro tests for the full number of bytes before reading any
        of them, so on the jump `src' has not been advanced; on success
        `src' has been advanced past the bytes read.  The bytes are stored
        into the arguments in source order, so each argument must be an
        lvalue.  The label `label_end_of_loop' must exist in the function
        that uses the macro.  */
 182
 183 #define ONE_MORE_BYTE(c1)       \
 184   do {                          \
 185     if (src < src_end)          \
 186       c1 = *src++;              \
 187     else                        \
 188       goto label_end_of_loop;   \
 189   } while (0)
 190
 191 #define TWO_MORE_BYTES(c1, c2)  \
 192   do {                          \
 193     if (src + 1 < src_end)      \
 194       c1 = *src++, c2 = *src++; \
 195     else                        \
 196       goto label_end_of_loop;   \
 197   } while (0)
 198
 199 #define THREE_MORE_BYTES(c1, c2, c3)            \
 200   do {                                          \
 201     if (src + 2 < src_end)                      \
 202       c1 = *src++, c2 = *src++, c3 = *src++;    \
 203     else                                        \
 204       goto label_end_of_loop;                   \
 205   } while (0)
 206
 207 /* The following three macros DECODE_CHARACTER_ASCII,
 208    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 209    the multi-byte form of a character of each class at the place
 210    pointed by `dst'.  The caller should set the variable `dst' to
 211    point to an appropriate area and the variable `coding' to point to
 212    the coding-system of the currently decoding text in advance.  */
 213
 214 /* Decode one ASCII character C.

        While composing (COMPOSING_P (coding->composing) is nonzero), C is
        written at `dst' as the two bytes 0xA0 and C | 0x80, and
        coding->composed_chars is incremented.  Otherwise C is written as
        a single byte and coding->produced_char is incremented.

        coding->fake_multibyte is set when C | 0x80 is below 0xA0 (while
        composing) or when C is 0x80 or above (while not composing).

        C is evaluated more than once, so it must not have side effects.  */
 215
 216 #define DECODE_CHARACTER_ASCII(c)               \
 217   do {                                          \
 218     if (COMPOSING_P (coding->composing))        \
 219       {                                         \
 220         *dst++ = 0xA0, *dst++ = (c) | 0x80;     \
 221         coding->composed_chars++;               \
 222         if (((c) | 0x80) < 0xA0)                \
 223           coding->fake_multibyte = 1;           \
 224       }                                         \
 225     else                                        \
 226       {                                         \
 227         *dst++ = (c);                           \
 228         coding->produced_char++;                \
 229         if ((c) >= 0x80)                        \
 230           coding->fake_multibyte = 1;           \
 231       }                                         \
 232   } while (0)
 233
 234 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 235    position-code is C.

        The bytes written at `dst' are: the base leading-code of CHARSET
        (CHARSET_LEADING_CODE_BASE), with 0x20 added while composing; then
        the extended leading-code (CHARSET_LEADING_CODE_EXT) if that is
        nonzero; then C | 0x80.  While composing, coding->composed_chars
        is incremented, otherwise coding->produced_char is.
        coding->fake_multibyte is set when C | 0x80 is below 0xA0.

        The assignment inside the second `if' is intentional: the local
        `leading_code' is reused to hold the extended leading-code and is
        tested for being nonzero.  CHARSET and C are each evaluated more
        than once, so they must not have side effects.  */
 236
 237 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 238   do {                                                                  \
 239     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 240     if (COMPOSING_P (coding->composing))                                \
 241       {                                                                 \
 242         *dst++ = leading_code + 0x20;                                   \
 243         coding->composed_chars++;                                       \
 244       }                                                                 \
 245     else                                                                \
 246       {                                                                 \
 247         *dst++ = leading_code;                                          \
 248         coding->produced_char++;                                        \
 249       }                                                                 \
 250     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 251       *dst++ = leading_code;                                            \
 252     *dst++ = (c) | 0x80;                                                \
 253     if (((c) | 0x80)  < 0xA0)                                           \
 254       coding->fake_multibyte = 1;                                       \
 255   } while (0)
 256
 257 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 258    position-codes are C1 and C2.

        DECODE_CHARACTER_DIMENSION1 writes the leading-code(s) and
        C1 | 0x80 and does the character counting; this macro then appends
        C2 | 0x80 and sets coding->fake_multibyte when that byte is below
        0xA0.  The arguments are evaluated more than once, so they must
        not have side effects.  */
 259
 260 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 261   do {                                                  \
 262     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 263     *dst++ = (c2) | 0x80;                               \
 264     if (((c2) | 0x80) < 0xA0)                           \
 265       coding->fake_multibyte = 1;                       \
 266   } while (0)
 267
 268 \f
 269 /*** 1. Preamble ***/
 270
 271 #include <stdio.h>
 272
 273 #ifdef emacs
 274
 275 #include <config.h>
 276 #include "lisp.h"
 277 #include "buffer.h"
 278 #include "charset.h"
 279 #include "ccl.h"
 280 #include "coding.h"
 281 #include "window.h"
 282
 283 #else  /* not emacs */
 284
 285 #include "mulelib.h"
 286
 287 #endif /* not emacs */
 288
 289 Lisp_Object Qcoding_system, Qeol_type;
 290 Lisp_Object Qbuffer_file_coding_system;
 291 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 292 Lisp_Object Qno_conversion, Qundecided;
 293 Lisp_Object Qcoding_system_history;
 294 Lisp_Object Qsafe_charsets;
 295 Lisp_Object Qvalid_codes;
     /* NOTE(review): the `Q...' objects above are presumably Lisp symbols
        (e.g. `coding-system', `eol-type') that are set up by
        initialization code outside this chunk -- confirm.  */
 296
     /* Only declared here (`extern'); presumably defined in another
        file.  */
 297 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 298 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 299 Lisp_Object Qstart_process, Qopen_network_stream;
 300 Lisp_Object Qtarget_idx;
 301
     /* NOTE(review): judging by the name, a Lisp function that selects a
        safe coding system; its use is not visible in this chunk.  */
 302 Lisp_Object Vselect_safe_coding_system_function;
 303
 304 /* Mnemonic string for each format of end-of-line.  */
 305 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 306 /* Mnemonic string to indicate format of end-of-line is not yet
 307    decided.  */
 308 Lisp_Object eol_mnemonic_undecided;
 309
 310 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 311    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 312 int system_eol_type;
 313
 314 #ifdef emacs
     /* Everything down to the matching `#endif' is compiled only when
        `emacs' is defined.  */
 315
     /* NOTE(review): presumably the list and the alist of defined coding
        systems; how they are maintained is not visible in this chunk.  */
 316 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 317
 318 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 319
 320 /* Coding system emacs-mule and raw-text are for converting only
 321    end-of-line format.  */
 322 Lisp_Object Qemacs_mule, Qraw_text;
 323
 324 /* Coding-systems are handed between Emacs Lisp programs and C internal
 325    routines by the following three variables.  */
 326 /* Coding-system for reading files and receiving data from process.  */
 327 Lisp_Object Vcoding_system_for_read;
 328 /* Coding-system for writing files and sending data to process.  */
 329 Lisp_Object Vcoding_system_for_write;
 330 /* Coding-system actually used in the latest I/O.  */
 331 Lisp_Object Vlast_coding_system_used;
 332
 333 /* A vector of length 256 which contains information about special
 334    Latin codes (especially for dealing with Microsoft codes).  */
 335 Lisp_Object Vlatin_extra_code_table;
 336
 337 /* Flag to inhibit code conversion of end-of-line format.  */
 338 int inhibit_eol_conversion;
 339
 340 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 341 int inherit_process_coding_system;
 342
 343 /* Coding system to be used to encode text for terminal display.  */
 344 struct coding_system terminal_coding;
 345
 346 /* Coding system to be used to encode text for terminal display when
 347    terminal coding system is nil.  */
 348 struct coding_system safe_terminal_coding;
 349
 350 /* Coding system of what is sent from terminal keyboard.  */
 351 struct coding_system keyboard_coding;
 352
 353 /* Default coding system to be used to write a file.  */
 354 struct coding_system default_buffer_file_coding;
 355
     /* NOTE(review): presumably alists consulted to choose a coding
        system for file, process, and network I/O respectively -- confirm
        against the Lisp-level documentation of these variables.  */
 356 Lisp_Object Vfile_coding_system_alist;
 357 Lisp_Object Vprocess_coding_system_alist;
 358 Lisp_Object Vnetwork_coding_system_alist;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qcoding_category, Qcoding_category_index;
 363
 364 /* List of symbols `coding-category-xxx' ordered by priority.  */
 365 Lisp_Object Vcoding_category_list;
 366
 367 /* Table of coding categories (Lisp symbols).  */
 368 Lisp_Object Vcoding_category_table;
 369
 370 /* Table of names of symbol for each coding-category.
        The initializer lists twelve names.  NOTE(review): their order
        presumably has to match the CODING_CATEGORY_IDX_XXX values used as
        indices, and CODING_CATEGORY_IDX_MAX (defined outside this chunk)
        has to be at least twelve -- confirm in `coding.h'.  */
 371 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 372   "coding-category-emacs-mule",
 373   "coding-category-sjis",
 374   "coding-category-iso-7",
 375   "coding-category-iso-7-tight",
 376   "coding-category-iso-8-1",
 377   "coding-category-iso-8-2",
 378   "coding-category-iso-7-else",
 379   "coding-category-iso-8-else",
 380   "coding-category-ccl",
 381   "coding-category-big5",
 382   "coding-category-raw-text",
 383   "coding-category-binary"
 384 };
 385
 386 /* Table of pointers to coding systems corresponding to each coding
 387    category.  An entry may be a null pointer; the macro CHARSET_OK
        tests the entry before dereferencing it.  */
 388 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 389
 390 /* Table of coding category masks.  Nth element is a mask for a coding
 391    category of which priority is Nth.  Private to this file.  */
 392 static
 393 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 394
 395 /* Flag to tell if we look up translation table on character code
 396    conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
     /* NOTE(review): presumably symbols related to translation tables;
        they are not used in the part of the file visible here.  */
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 Lisp_Object Vcharset_revision_alist;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 \f
 415 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 416
 417 /* Emacs' internal format for encoding multiple character sets is a
 418    kind of multi-byte encoding, i.e. characters are encoded by
 419    variable-length sequences of one-byte codes.  ASCII characters
 420    and control characters (e.g. `tab', `newline') are represented by
 421    one-byte sequences which are their ASCII codes, in the range 0x00
 422    through 0x7F.  The other characters are represented by a sequence
 423    of `base leading-code', optional `extended leading-code', and one
 424    or two `position-code's.  The length of the sequence is determined
 425    by the base leading-code.  Leading-code takes the range 0x80
 426    through 0x9F, whereas extended leading-code and position-code take
 427    the range 0xA0 through 0xFF.  See `charset.h' for more details
 428    about leading-code and position-code.
 429
 430    There's one exception to this rule.  Special leading-code
 431    `leading-code-composition' denotes that the following several
 432    characters should be composed into one character.  Leading-codes of
 433    components (except for ASCII) have 0x20 added.  An ASCII character
 434    component is represented by a 2-byte sequence of `0xA0' and
 435    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 436    details of composite character.  Hence, we can summarize the code
 437    range as follows:
 438
 439    --- CODE RANGE of Emacs' internal format ---
 440    (character set)      (range)
 441    ASCII                0x00 .. 0x7F
 442    ELSE (1st byte)      0x80 .. 0x9F
 443         (rest bytes)    0xA0 .. 0xFF
 444    ---------------------------------------------
 445
 446   */
 447
 448 enum emacs_code_class_type emacs_code_class[256];
     /* The table above has one entry per byte value;
        detect_coding_emacs_mule switches on emacs_code_class[c].
        NOTE(review): presumably filled in by initialization code outside
        this chunk.  */
 449
 450 /* Go to the next statement only if *SRC is accessible and the code is
 451    not less than 0xA0 (0xA0 itself is accepted).  If SRC has already
        reached SRC_END, jump to `label_end_of_switch', which the
        enclosing function must define.  If the code is less than 0xA0,
        return 0 from the enclosing function.  SRC is advanced past the
        code whenever the code is read.  */
 452 #define CHECK_CODE_RANGE_A0_FF  \
 453   do {                          \
 454     if (src >= src_end)         \
 455       goto label_end_of_switch; \
 456     else if (*src++ < 0xA0)     \
 457       return 0;                 \
 458   } while (0)
 459
 460 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 461    Check if a text is encoded in Emacs' internal format.  If it is,
 462    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.

        SRC points to the first byte to examine and SRC_END to the byte
        just past the last one.  Zero is returned as soon as one of these
        is found: a byte classified as EMACS_invalid_code, an ESC, SI, or
        SO control code, or a byte below 0xA0 where a byte following a
        leading-code is required.  A sequence that is cut off by SRC_END
        is not counted as an error.  */
 463
 464 int
 465 detect_coding_emacs_mule (src, src_end)
 466      unsigned char *src, *src_end;
 467 {
 468   unsigned char c;
 469   int composing = 0;
 470
 471   while (src < src_end)
 472     {
 473       c = *src++;
 474
 475       if (composing)
 476         {
               /* Inside a composition, a byte below 0xA0 ends it.  Any
                  other byte has 0x20 subtracted before it is classified
                  (component leading-codes are stored with 0x20 added).  */
 477           if (c < 0xA0)
 478             composing = 0;
 479           else
 480             c -= 0x20;
 481         }
 482
 483       switch (emacs_code_class[c])
 484         {
 485         case EMACS_ascii_code:
 486         case EMACS_linefeed_code:
 487           break;
 488
 489         case EMACS_control_code:
               /* These three control codes make the text rejected.  */
 490           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 491             return 0;
 492           break;
 493
 494         case EMACS_invalid_code:
 495           return 0;
 496
 497         case EMACS_leading_code_composition: /* c == 0x80 */
               /* While composing, C is 0x80 here only because the byte
                  read was 0xA0; it must be followed by one more byte that
                  is not less than 0xA0.  Otherwise this byte starts a
                  composition.  */
 498           if (composing)
 499             CHECK_CODE_RANGE_A0_FF;
 500           else
 501             composing = 1;
 502           break;
 503
 504         case EMACS_leading_code_4:
 505           CHECK_CODE_RANGE_A0_FF;
 506           /* fall down to check it two more times ...  */
 507
 508         case EMACS_leading_code_3:
 509           CHECK_CODE_RANGE_A0_FF;
 510           /* fall down to check it one more time ...  */
 511
 512         case EMACS_leading_code_2:
 513           CHECK_CODE_RANGE_A0_FF;
 514           break;
 515
 516         default:
               /* CHECK_CODE_RANGE_A0_FF jumps here when SRC reaches
                  SRC_END in the middle of a sequence.  */
 517         label_end_of_switch:
 518           break;
 519         }
 520     }
 521   return CODING_CATEGORY_MASK_EMACS_MULE;
 522 }
 523
 524 \f
 525 /*** 3. ISO2022 handlers ***/
 526
 527 /* The following note describes the coding system ISO2022 briefly.
 528    Since the intention of this note is to help in understanding of
 529    the programs in this file, some parts are NOT ACCURATE or OVERLY
 530    SIMPLIFIED.  For the thorough understanding, please refer to the
 531    original document of ISO2022.
 532
 533    ISO2022 provides many mechanisms to encode several character sets
 534    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 535    all text is encoded by codes of less than 128.  This may make the
 536    encoded text a little bit longer, but the text gets more stability
 537    to pass through several gateways (some of them strip off the MSB).
 538
 539    There are two kinds of character set: control character set and
 540    graphic character set.  The former contains control characters such
 541    as `newline' and `escape' to provide control functions (control
 542    functions are provided also by escape sequences).  The latter
 543    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 544    two control character sets and many graphic character sets.
 545
 546    Graphic character sets are classified into one of the following
 547    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 548    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 549    bytes (DIMENSION) and the number of characters in one dimension
 550    (CHARS) of the set.  In addition, each character set is assigned an
 551    identification tag (called "final character" and denoted as <F>
 552    here after) which is unique in each class.  <F> of each character
 553    set is decided by ECMA(*) when it is registered in ISO.  Code range
 554    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 555
 556    Note (*): ECMA = European Computer Manufacturers Association
 557
 558    Here are examples of graphic character set [NAME(<F>)]:
 559         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 560         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 561         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 562         o DIMENSION2_CHARS96 -- none for the moment
 563
 564    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 565         C0 [0x00..0x1F] -- control character plane 0
 566         GL [0x20..0x7F] -- graphic character plane 0
 567         C1 [0x80..0x9F] -- control character plane 1
 568         GR [0xA0..0xFF] -- graphic character plane 1
 569
 570    A control character set is directly designated and invoked to C0 or
 571    C1 by an escape sequence.  The most common case is that ISO646's
 572    control character set is designated/invoked to C0 and ISO6429's
 573    control character set is designated/invoked to C1, and usually
 574    these designations/invocations are omitted in a coded text.  With
 575    7-bit environment, only C0 can be used, and a control character for
 576    C1 is encoded by an appropriate escape sequence to fit in the
 577    environment.  All control characters for C1 have corresponding
 578    escape sequences defined.
 579
 580    A graphic character set is at first designated to one of four
 581    graphic registers (G0 through G3), then these graphic registers are
 582    invoked to GL or GR.  These designations and invocations can be
 583    done independently.  The most common case is that G0 is invoked to
 584    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 585    these invocations and designations are omitted in a coded text.
 586    With 7-bit environment, only GL can be used.
 587
 588    When a graphic character set of CHARS94 is invoked to GL, codes 0x20
 589    and 0x7F of GL area work as control characters SPACE and DEL
 590    respectively; when invoked to GR, codes 0xA0 and 0xFF are not used.
 591
 592    There are two ways of invocation: locking-shift and single-shift.
 593    With locking-shift, the invocation lasts until the next different
 594    invocation, whereas with single-shift, the invocation works only
 595    for the following character and doesn't affect locking-shift.
 596    Invocations are done by the following control characters or escape
 597    sequences.
 598
 599    ----------------------------------------------------------------------
 600    function             control char    escape sequence description
 601    ----------------------------------------------------------------------
 602    SI  (shift-in)               0x0F    none            invoke G0 to GL
 603    SO  (shift-out)              0x0E    none            invoke G1 to GL
 604    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 605    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 606    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 607    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 608    ----------------------------------------------------------------------
 609    The first four are for locking-shift.  Control characters for these
 610    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 611
 612    Designations are done by the following escape sequences.
 613    ----------------------------------------------------------------------
 614    escape sequence      description
 615    ----------------------------------------------------------------------
 616    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 617    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 618    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 619    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 620    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 621    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 622    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 623    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 624    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 625    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 626    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 627    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 628    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 629    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 630    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 631    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 632    ----------------------------------------------------------------------
 633
 634    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 635    of dimension 1, chars 94, and final character <F>, and etc.
 636
 637    Note (*): Although these designations are not allowed in ISO2022,
 638    Emacs accepts them on decoding, and produces them on encoding
 639    CHARS96 character set in a coding system which is characterized as
 640    7-bit environment, non-locking-shift, and non-single-shift.
 641
 642    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 643    '(' can be omitted.  We call this as "short-form" here after.
 644
 645    Now you may notice that there are a lot of ways for encoding the
 646    same multilingual text in ISO2022.  Actually, there exist many
 647    coding systems such as Compound Text (used in X's inter client
 648    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 649    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 650    localized platforms), and all of these are variants of ISO2022.
 651
 652    In addition to the above, Emacs handles two more kinds of escape
 653    sequences: ISO6429's direction specification and Emacs' private
 654    sequence for specifying character composition.
 655
 656    ISO6429's direction specification takes the following format:
 657         o CSI ']'      -- end of the current direction
 658         o CSI '0' ']'  -- end of the current direction
 659         o CSI '1' ']'  -- start of left-to-right text
 660         o CSI '2' ']'  -- start of right-to-left text
 661    The control character CSI (0x9B: control sequence introducer) is
 662    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 663
 664    Character composition specification takes the following format:
 665         o ESC '0' -- start character composition
 666         o ESC '1' -- end character composition
 667    Since these are not standard escape sequences of any ISO, the use
 668    of them for these meanings is restricted to Emacs only.  */
 669
 670 enum iso_code_class_type iso_code_class[256];
     /* The table above has one entry per byte value.  NOTE(review): it is
        not referenced in the part of the file visible here; presumably
        it is the ISO2022 counterpart of emacs_code_class -- confirm.  */
 671
     /* Nonzero if coding_system_table[IDX] is non-null and that coding
        system accepts CHARSET, i.e. either its safe_charsets[CHARSET]
        entry is nonzero or CODING_SPEC_ISO_REQUESTED_DESIGNATION for
        CHARSET is something other than
        CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  IDX and CHARSET are
        evaluated more than once.  */
 672 #define CHARSET_OK(idx, charset)                                \
 673   (coding_system_table[idx]                                     \
 674    && (coding_system_table[idx]->safe_charsets[charset]         \
 675        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 676             (coding_system_table[idx], charset)                 \
 677            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 678
     /* Nonzero if CODING_SPEC_ISO_INITIAL_DESIGNATION for register 1 of
        coding_system_table[IDX] is not negative.  NOTE(review): register
        1 is presumably G1, so this would mean a charset is initially
        designated to G1 -- confirm in `coding.h'.  Unlike CHARSET_OK,
        this macro does not test coding_system_table[IDX] for null.  */
 679 #define SHIFT_OUT_OK(idx) \
 680   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 681
 682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 683    Check if the text from SRC up to SRC_END is encoded in ISO2022.
 684    If it is, return an integer in which the appropriate flag bits, any of
 685         CODING_CATEGORY_MASK_ISO_7, CODING_CATEGORY_MASK_ISO_7_TIGHT,
 686         CODING_CATEGORY_MASK_ISO_8_1, CODING_CATEGORY_MASK_ISO_8_2,
 687         CODING_CATEGORY_MASK_ISO_7_ELSE, CODING_CATEGORY_MASK_ISO_8_ELSE,
 688    are set.  If a code which should never appear in ISO2022 is found,
 689    return 0.  MASK holds the categories not yet ruled out and
 690    MASK_FOUND those for which some evidence was seen; the result is
 691    their intersection.  NOTE(review): SHIFT_OUT is never assigned after
 692    its initialization, so the test under ISO_CODE_SI never succeeds.  */
 693
 694 int
 695 detect_coding_iso2022 (src, src_end)
 696      unsigned char *src, *src_end;
 697 {
 698   int mask = CODING_CATEGORY_MASK_ISO;
 699   int mask_found = 0;
 700   int reg[4], shift_out = 0, single_shifting = 0;
 701   int c, c1, i, charset;
 702   /* REG[N] is the charset designated to graphic register N, -1 if none.  */
 703   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 704   while (mask && src < src_end)
 705     {
 706       c = *src++;
 707       switch (c)
 708         {
 709         case ISO_CODE_ESC:
 710           single_shifting = 0;
 711           if (src >= src_end)
 712             break;
 713           c = *src++;
 714           if (c >= '(' && c <= '/')
 715             {
 716               /* Designation sequence for a charset of dimension 1.  */
 717               if (src >= src_end)
 718                 break;
 719               c1 = *src++;
 720               if (c1 < ' ' || c1 >= 0x80
 721                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 722                 /* Invalid designation sequence.  Just ignore.  */
 723                 break;
 724               reg[(c - '(') % 4] = charset;
 725             }
 726           else if (c == '$')
 727             {
 728               /* Designation sequence for a charset of dimension 2.  */
 729               if (src >= src_end)
 730                 break;
 731               c = *src++;
 732               if (c >= '@' && c <= 'B')
 733                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 734                 reg[0] = charset = iso_charset_table[1][0][c];
 735               else if (c >= '(' && c <= '/')
 736                 {
 737                   if (src >= src_end)
 738                     break;
 739                   c1 = *src++;
 740                   if (c1 < ' ' || c1 >= 0x80
 741                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 742                     /* Invalid designation sequence.  Just ignore.  */
 743                     break;
 744                   reg[(c - '(') % 4] = charset;
 745                 }
 746               else
 747                 /* Invalid designation sequence.  Just ignore.  */
 748                 break;
 749             }
 750           else if (c == 'N' || c == 'O')
 751             {
 752               /* ESC <Fe> for SS2 or SS3.  */
 753               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 754               break;
 755             }
 756           else if (c == '0' || c == '1' || c == '2')
 757             /* ESC <Fp> for start/end composition.  Just ignore.  */
 758             break;
 759           else
 760             /* Invalid escape sequence.  Just ignore.  */
 761             break;
 762           /* We found a valid designation sequence for CHARSET.  NOTE(review):
 763              the ESC $ @/A/B branch does not check that CHARSET is non-negative.  */
 764           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 765           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 766             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 767           else
 768             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 769           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 770             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 771           else
 772             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 773           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 774             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 775           else
 776             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 777           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 778             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 779           else
 780             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 781           break;
 782
 783         case ISO_CODE_SO:
 784           single_shifting = 0;
 785           if (shift_out == 0
 786               && (reg[1] >= 0
 787                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 788                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 789             {
 790               /* Locking shift out.  */
 791               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 792               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 793             }
 794           break;
 795
 796         case ISO_CODE_SI:
 797           single_shifting = 0;
 798           if (shift_out == 1)
 799             {
 800               /* Locking shift in.  */
 801               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 802               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 803             }
 804           break;
 805
 806         case ISO_CODE_CSI:
 807           single_shifting = 0;  /* Fall through: CSI shares the code below.  */
 808         case ISO_CODE_SS2:
 809         case ISO_CODE_SS3:
 810           {
 811             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 812
 813             if (c != ISO_CODE_CSI)
 814               {
 815                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 816                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 817                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 818                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 819                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 820                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 821                 single_shifting = 1;
 822               }
 823             if (VECTORP (Vlatin_extra_code_table)
 824                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 825               {
 826                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 827                     & CODING_FLAG_ISO_LATIN_EXTRA)
 828                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 829                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 830                     & CODING_FLAG_ISO_LATIN_EXTRA)
 831                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 832               }
 833             mask &= newmask;
 834             mask_found |= newmask;
 835           }
 836           break;
 837
 838         default:
 839           if (c < 0x80)
 840             {
 841               single_shifting = 0;
 842               break;
 843             }
 844           else if (c < 0xA0)
 845             {
 846               single_shifting = 0;
 847               if (VECTORP (Vlatin_extra_code_table)
 848                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 849                 {
 850                   int newmask = 0;
 851
 852                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 853                       & CODING_FLAG_ISO_LATIN_EXTRA)
 854                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 855                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 856                       & CODING_FLAG_ISO_LATIN_EXTRA)
 857                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 858                   mask &= newmask;
 859                   mask_found |= newmask;
 860                 }
 861               else
 862                 return 0;
 863             }
 864           else
 865             {
 866               unsigned char *src_begin = src;
 867
 868               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 869                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 870               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 871               /* Check the length of succeeding codes of the range
 872                  0xA0..0xFF.  If the byte length is odd, we exclude
 873                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 874                  when we are not single shifting.  */
 875               if (!single_shifting)
 876                 {
 877                   while (src < src_end && *src >= 0xA0)
 878                     src++;
 879                   if ((src - src_begin - 1) & 1 && src < src_end)
 880                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 881                   else
 882                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
 883                 }
 884             }
 885           break;
 886         }
 887     }
 888
 889   return (mask & mask_found);
 890 }
 891
 892 /* Decode a character of which charset is CHARSET and the 1st position
 893    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 894    fetched from SRC and set to C2; if its class (low 7 bits) is neither
 895    ISO_0x20_or_0x7F nor ISO_graphic_plane_0, SRC is backed up and C1 is
 896    treated as ASCII.  If CHARSET is negative, it means that we are
 897    decoding ill formed text, and what we can do is just to read C1 as is.  */
 898 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 899   do {                                                                  \
 900     int c_alt, charset_alt = (charset);                                 \
 901     if (COMPOSING_HEAD_P (coding->composing))                           \
 902       {                                                                 \
 903         *dst++ = LEADING_CODE_COMPOSITION;                              \
 904         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 905           /* To tell composition rules are embedded.  */                \
 906           *dst++ = 0xFF;                                                \
 907         coding->composing += 2;                                         \
 908       }                                                                 \
 909     if (charset_alt >= 0)                                               \
 910       {                                                                 \
 911         if (CHARSET_DIMENSION (charset_alt) == 2)                       \
 912           {                                                             \
 913             ONE_MORE_BYTE (c2);                                         \
 914             if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
 915                 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
 916               {                                                         \
 917                 src--;                                                  \
 918                 charset_alt = CHARSET_ASCII;                            \
 919               }                                                         \
 920           }                                                             \
 921         if (!NILP (translation_table) /* NOTE(review): C2 is passed below even when the dimension is 1 and it was not fetched.  */ \
 922             && ((c_alt = translate_char (translation_table,             \
 923                                          -1, charset_alt, c1, c2)) >= 0)) \
 924           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 925       }                                                                 \
 926     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 927       DECODE_CHARACTER_ASCII (c1);                                      \
 928     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 929       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 930     else                                                                \
 931       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 932     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 933       /* To tell a composition rule follows.  */                        \
 934       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 935   } while (0)
 936 /* Set designation state into CODING: designate the charset that has DIMENSION, CHARS and
 937    FINAL_CHAR to register REG.  Arguments are parenthesized as callers pass e.g. `c1 - 0x28'.  */
 938 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
 939   do {                                                                     \
 940     int charset;                                                           \
 941     /* On any failure below, the caller's label_invalid_code is used.  */  \
 942     if ((final_char) < '0' || (final_char) >= 128)                         \
 943       goto label_invalid_code;                                             \
 944     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
 945                                  make_number (chars),                      \
 946                                  make_number (final_char));                \
 947     if (charset >= 0                                                       \
 948         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
 949             || coding->safe_charsets[charset]))                            \
 950       {                                                                    \
 951         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
 952             && (reg) == 0                                                  \
 953             && charset == CHARSET_ASCII)                                   \
 954           {                                                                \
 955             /* We should insert this designation sequence as is so         \
 956                that it is surely written back to a file.  */               \
 957             coding->spec.iso2022.last_invalid_designation_register = -1;   \
 958             goto label_invalid_code;                                       \
 959           }                                                                \
 960         coding->spec.iso2022.last_invalid_designation_register = -1;       \
 961         if ((coding->mode & CODING_MODE_DIRECTION)                         \
 962             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
 963           charset = CHARSET_REVERSE_CHARSET (charset);                     \
 964         CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;             \
 965       }                                                                    \
 966     else                                                                   \
 967       {                                                                    \
 968         coding->spec.iso2022.last_invalid_designation_register = (reg);    \
 969         goto label_invalid_code;                                           \
 970       }                                                                    \
 971   } while (0)
 972
 973 /* Return 0 if there's a valid composing sequence starting at SRC and
 974    ending with ESC '1' before SRC_END, else return -1.  Until then only
 975    bytes >= 0x20 and charset designations CODING accepts may appear.  */
 976 int
 977 check_composing_code (coding, src, src_end)
 978      struct coding_system *coding;
 979      unsigned char *src, *src_end;
 980 {
 981   int charset, c, c1, dim;
 982
 983   while (src < src_end)
 984     {
 985       c = *src++;
 986       if (c >= 0x20)
 987         continue;
 988       if (c != ISO_CODE_ESC || src >= src_end)
 989         return -1;
 990       c = *src++;
 991       if (c == '1') /* end of composition */
 992         return 0;
 993       if (src + 2 >= src_end
 994           || !(coding->flags & CODING_FLAG_ISO_DESIGNATION)) /* `!' binds tighter than `&' */
 995         return -1;
 996       /* DIM is 1 for ESC $ (a 2-byte charset); ESC $ @/A/B omits the `(' byte.  */
 997       dim = (c == '$');
 998       if (dim == 1)
 999         c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1000       if (c >= '(' && c <= '/')
1001         {
1002           c1 = *src++;
1003           if ((c1 < ' ' || c1 >= 0x80)
1004               || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1005               || ! coding->safe_charsets[charset]
1006               || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1007                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1008             return -1;
1009         }
1010       else
1011         return -1;
1012     }
1013
1014   /* We have not found the sequence "ESC 1".  */
1015   return -1;
1016 }
1017
1018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1019    Return a CODING_FINISH_XXX code; set CODING's consumed, consumed_char and produced counts.  */
1020 int
1021 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1022      struct coding_system *coding;
1023      unsigned char *source, *destination;
1024      int src_bytes, dst_bytes;
1025 {
1026   unsigned char *src = source;
1027   unsigned char *src_end = source + src_bytes;
1028   unsigned char *dst = destination;
1029   unsigned char *dst_end = destination + dst_bytes;
1030   /* Since the maximum bytes produced by each loop is 7, we subtract 6
1031      from DST_END to assure that overflow checking is necessary only
1032      at the head of loop.  */
1033   unsigned char *adjusted_dst_end = dst_end - 6;
1034   int charset;
1035   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1036   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1037   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1038   Lisp_Object translation_table
1039     = coding->translation_table_for_decode;
1040   int result = CODING_FINISH_NORMAL;
1041
1042   if (!NILP (Venable_character_translation) && NILP (translation_table))
1043     translation_table = Vstandard_translation_table_for_decode;
1044   /* NOTE(review): with DST_BYTES zero the loop is bounded by SRC - 6 instead, presumably for decoding in place.  */
1045   coding->produced_char = 0;
1046   coding->fake_multibyte = 0;
1047   while (src < src_end && (dst_bytes
1048                            ? (dst < adjusted_dst_end)
1049                            : (dst < src - 6)))
1050     {
1051       /* SRC_BASE remembers the start position in source in each loop.
1052          The loop will be exited when there's not enough source text
1053          to analyze long escape sequence or 2-byte code (within macros
1054          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
1055          to SRC_BASE before exiting.  */
1056       unsigned char *src_base = src;
1057       int c1 = *src++, c2;
1058
1059       switch (iso_code_class [c1])
1060         {
1061         case ISO_0x20_or_0x7F:
1062           if (!coding->composing
1063               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1064             {
1065               /* This is SPACE or DEL.  */
1066               *dst++ = c1;
1067               coding->produced_char++;
1068               break;
1069             }
1070           /* This is a graphic character, we fall down ...  */
1071
1072         case ISO_graphic_plane_0:
1073           if (coding->composing == COMPOSING_WITH_RULE_RULE)
1074             {
1075               /* This is a composition rule.  */
1076               *dst++ = c1 | 0x80;
1077               coding->composing = COMPOSING_WITH_RULE_TAIL;
1078             }
1079           else
1080             DECODE_ISO_CHARACTER (charset0, c1);
1081           break;
1082
1083         case ISO_0xA0_or_0xFF:
1084           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1085               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1086             goto label_invalid_code;
1087           /* This is a graphic character, we fall down ... */
1088
1089         case ISO_graphic_plane_1:
1090           if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1091             goto label_invalid_code;
1092           else
1093             DECODE_ISO_CHARACTER (charset1, c1);
1094           break;
1095
1096         case ISO_control_code:
1097           /* All ISO2022 control characters in this class have the
1098              same representation in Emacs internal format.  */
1099           if (c1 == '\n'
1100               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1101               && (coding->eol_type == CODING_EOL_CR
1102                   || coding->eol_type == CODING_EOL_CRLF))
1103             {
1104               result = CODING_FINISH_INCONSISTENT_EOL;
1105               goto label_end_of_loop_2;
1106             }
1107           *dst++ = c1;
1108           coding->produced_char++;
1109           if (c1 >= 0x80)
1110             coding->fake_multibyte = 1;
1111           break;
1112
1113         case ISO_carriage_return:
1114           if (coding->eol_type == CODING_EOL_CR)
1115             *dst++ = '\n';
1116           else if (coding->eol_type == CODING_EOL_CRLF)
1117             {
1118               ONE_MORE_BYTE (c1);
1119               if (c1 == ISO_CODE_LF)
1120                 *dst++ = '\n';
1121               else
1122                 {
1123                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1124                     {
1125                       result = CODING_FINISH_INCONSISTENT_EOL;
1126                       goto label_end_of_loop_2;
1127                     }
1128                   src--;
1129                   *dst++ = '\r';
1130                 }
1131             }
1132           else
1133             *dst++ = c1;
1134           coding->produced_char++;
1135           break;
1136
1137         case ISO_shift_out:
1138           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1139               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1140             goto label_invalid_code;
1141           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1142           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1143           break;
1144
1145         case ISO_shift_in:
1146           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1147             goto label_invalid_code;
1148           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1149           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1150           break;
1151
1152         case ISO_single_shift_2_7:
1153         case ISO_single_shift_2:
1154           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1155             goto label_invalid_code;
1156           /* SS2 is handled as an escape sequence of ESC 'N' */
1157           c1 = 'N';
1158           goto label_escape_sequence;
1159
1160         case ISO_single_shift_3:
1161           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1162             goto label_invalid_code;
1163           /* SS3 is handled as an escape sequence of ESC 'O' */
1164           c1 = 'O';
1165           goto label_escape_sequence;
1166
1167         case ISO_control_sequence_introducer:
1168           /* CSI is handled as an escape sequence of ESC '[' ...  */
1169           c1 = '[';
1170           goto label_escape_sequence;
1171
1172         case ISO_escape:
1173           ONE_MORE_BYTE (c1);
1174         label_escape_sequence:
1175           /* Escape sequences handled by Emacs are invocation,
1176              designation, direction specification, and character
1177              composition specification.  */
1178           switch (c1)
1179             {
1180             case '&':           /* revision of following character set */
1181               ONE_MORE_BYTE (c1);
1182               if (!(c1 >= '@' && c1 <= '~'))
1183                 goto label_invalid_code;
1184               ONE_MORE_BYTE (c1);
1185               if (c1 != ISO_CODE_ESC)
1186                 goto label_invalid_code;
1187               ONE_MORE_BYTE (c1);
1188               goto label_escape_sequence;
1189
1190             case '$':           /* designation of 2-byte character set */
1191               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1192                 goto label_invalid_code;
1193               ONE_MORE_BYTE (c1);
1194               if (c1 >= '@' && c1 <= 'B')
1195                 {       /* designation of JISX0208.1978, GB2312.1980,
1196                            or JISX0208.1980 */
1197                   DECODE_DESIGNATION (0, 2, 94, c1);
1198                 }
1199               else if (c1 >= 0x28 && c1 <= 0x2B)
1200                 {       /* designation of DIMENSION2_CHARS94 character set */
1201                   ONE_MORE_BYTE (c2);
1202                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1203                 }
1204               else if (c1 >= 0x2C && c1 <= 0x2F)
1205                 {       /* designation of DIMENSION2_CHARS96 character set */
1206                   ONE_MORE_BYTE (c2);
1207                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1208                 }
1209               else
1210                 goto label_invalid_code;
1211               break;
1212
1213             case 'n':           /* invocation of locking-shift-2 */
1214               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1215                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1216                 goto label_invalid_code;
1217               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1218               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1219               break;
1220
1221             case 'o':           /* invocation of locking-shift-3 */
1222               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1223                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1224                 goto label_invalid_code;
1225               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1226               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1227               break;
1228
1229             case 'N':           /* invocation of single-shift-2 */
1230               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1231                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1232                 goto label_invalid_code;
1233               ONE_MORE_BYTE (c1);
1234               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1235               DECODE_ISO_CHARACTER (charset, c1);
1236               break;
1237
1238             case 'O':           /* invocation of single-shift-3 */
1239               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1240                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1241                 goto label_invalid_code;
1242               ONE_MORE_BYTE (c1);
1243               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1244               DECODE_ISO_CHARACTER (charset, c1);
1245               break;
1246
1247             case '0': case '2': /* start composing */
1248               /* Before processing composing, we must be sure that all
1249                  characters being composed are supported by CODING.
1250                  If not, we must give up composing.  */
1251               if (check_composing_code (coding, src, src_end) == 0)
1252                 {
1253                   /* We are looking at a valid composition sequence.  */
1254                   coding->composing = (c1 == '0'
1255                                        ? COMPOSING_NO_RULE_HEAD
1256                                        : COMPOSING_WITH_RULE_HEAD);
1257                   coding->composed_chars = 0;
1258                 }
1259               else
1260                 {
1261                   *dst++ = ISO_CODE_ESC;
1262                   *dst++ = c1;
1263                   coding->produced_char += 2;
1264                 }
1265               break;
1266
1267             case '1':           /* end composing */
1268               if (!coding->composing)
1269                 {
1270                   *dst++ = ISO_CODE_ESC;
1271                   *dst++ = c1;
1272                   coding->produced_char += 2;
1273                   break;
1274                 }
1275
1276               if (coding->composed_chars > 0)
1277                 {
1278                   if (coding->composed_chars == 1)
1279                     {
1280                       unsigned char *this_char_start = dst;
1281                       int this_bytes;
1282
1283                       /* Only one character is in the composing
1284                          sequence.  Make it a normal character.  */
1285                       while (*--this_char_start != LEADING_CODE_COMPOSITION);
1286                       dst = (this_char_start
1287                              + (coding->composing == COMPOSING_NO_RULE_TAIL
1288                                 ? 1 : 2));
1289                       *dst -= 0x20;
1290                       if (*dst == 0x80)
1291                         *++dst &= 0x7F;
1292                       this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1293                       while (this_bytes--) *this_char_start++ = *dst++;
1294                       dst = this_char_start;
1295                     }
1296                   coding->produced_char++;
1297                 }
1298               coding->composing = COMPOSING_NO;
1299               break;
1300
1301             case '[':           /* specification of direction */
1302               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1303                 goto label_invalid_code;
1304               /* For the moment, nested direction is not supported.
1305                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1306                  left-to-right, and nonzero means right-to-left.  */
1307               ONE_MORE_BYTE (c1);
1308               switch (c1)
1309                 {
1310                 case ']':       /* end of the current direction; the sequence is complete here */
1311                   coding->mode &= ~CODING_MODE_DIRECTION;
1312                   break;
1313                 case '0':       /* end of the current direction */
1314                 case '1':       /* start of left-to-right direction */
1315                   ONE_MORE_BYTE (c1);
1316                   if (c1 == ']')
1317                     coding->mode &= ~CODING_MODE_DIRECTION;
1318                   else
1319                     goto label_invalid_code;
1320                   break;
1321
1322                 case '2':       /* start of right-to-left direction */
1323                   ONE_MORE_BYTE (c1);
1324                   if (c1 == ']')
1325                     coding->mode |= CODING_MODE_DIRECTION;
1326                   else
1327                     goto label_invalid_code;
1328                   break;
1329
1330                 default:
1331                   goto label_invalid_code;
1332                 }
1333               break;
1334
1335             default:
1336               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1337                 goto label_invalid_code;
1338               if (c1 >= 0x28 && c1 <= 0x2B)
1339                 {       /* designation of DIMENSION1_CHARS94 character set */
1340                   ONE_MORE_BYTE (c2);
1341                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1342                 }
1343               else if (c1 >= 0x2C && c1 <= 0x2F)
1344                 {       /* designation of DIMENSION1_CHARS96 character set */
1345                   ONE_MORE_BYTE (c2);
1346                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1347                 }
1348               else
1349                 {
1350                   goto label_invalid_code;
1351                 }
1352             }
1353           /* We must update these variables now.  */
1354           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1355           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1356           break;
1357
1358         label_invalid_code:
1359           while (src_base < src)
1360             *dst++ = *src_base++;
1361           coding->fake_multibyte = 1;
1362         }
1363       continue;
1364
1365     label_end_of_loop:
1366       result = CODING_FINISH_INSUFFICIENT_SRC;
1367     label_end_of_loop_2:
1368       src = src_base;
1369       break;
1370     }
1371
1372   if (src < src_end)
1373     {
1374       if (result == CODING_FINISH_NORMAL)
1375         result = CODING_FINISH_INSUFFICIENT_DST;
1376       else if (result != CODING_FINISH_INCONSISTENT_EOL
1377                && coding->mode & CODING_MODE_LAST_BLOCK)
1378         {
1379           /* This is the last block of the text to be decoded.  We had
1380              better just flush out all remaining codes in the text
1381              although they are not valid characters.  */
1382           src_bytes = src_end - src;
1383           if (dst_bytes && (dst_end - dst < src_bytes))
1384             src_bytes = dst_end - dst;
1385           bcopy (src, dst, src_bytes);
1386           dst += src_bytes;
1387           src += src_bytes;
1388           coding->fake_multibyte = 1;
1389         }
1390     }
1391   /* Report how much of SOURCE was consumed and how much was written.  */
1392   coding->consumed = coding->consumed_char = src - source;
1393   coding->produced = dst - destination;
1394   return result;
1395 }
1396
1397 /* ISO2022 encoding stuff.  */
1398
1399 /*
1400    It is not enough to say just "ISO2022" on encoding, we have to
1401    specify more details.  In Emacs, each coding system of ISO2022
1402    variant has the following specifications:
1403         1. Initial designation to G0 thru G3.
1404         2. Allows short-form designation?
1405         3. ASCII should be designated to G0 before control characters?
1406         4. ASCII should be designated to G0 at end of line?
1407         5. 7-bit environment or 8-bit environment?
1408         6. Use locking-shift?
1409         7. Use Single-shift?
1410    And the following two are only for Japanese:
1411         8. Use ASCII in place of JISX0201-1976-Roman?
1412         9. Use JISX0208-1983 in place of JISX0208-1978?
1413    These specifications are encoded in `coding->flags' as flag bits
1414    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1415    details.
1416 */
1417
1418 /* Emit at DST the escape sequence designating CHARSET to graphic
1419    register REG, and record that designation in CODING.  If CODING
1420    allows it, the short form is used for a multi-byte 94-charset
1421    designated to register 0 whose <final-char> is '@', 'A', or 'B'.  */
1422
1423 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1424   do {                                                                  \
1425     unsigned char fin = CHARSET_ISO_FINAL_CHAR (charset);               \
1426     int rev = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);         \
1427     int is_multi = CHARSET_DIMENSION (charset) != 1;                    \
1428     int is_94 = CHARSET_CHARS (charset) == 94;                          \
1429     /* Intermediate characters, indexed by graphic register: one set    \
1430        for 94-charsets and another for 96-charsets.  */                 \
1431     char *inter = is_94 ? "()*+" : ",-./";                              \
1432     /* Non-zero if the intermediate character is to be left out,        \
1433        i.e. if the designation is of the short form.  */                \
1434     int omit_inter = (is_multi                                          \
1435                       && is_94                                          \
1436                       && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)   \
1437                       && reg == 0                                       \
1438                       && fin >= '@'                                     \
1439                       && fin <= 'B');                                   \
1440                                                                         \
1441     /* A revision number below 255 is announced first, as ESC '&'       \
1442        followed by '@' + revision.  */                                  \
1443     if (rev < 255)                                                      \
1444       {                                                                 \
1445         *dst++ = ISO_CODE_ESC;                                          \
1446         *dst++ = '&';                                                   \
1447         *dst++ = '@' + rev;                                             \
1448       }                                                                 \
1449                                                                         \
1450     /* The designation itself: ESC ['$'] [intermediate] final.  */      \
1451     *dst++ = ISO_CODE_ESC;                                              \
1452     if (is_multi)                                                       \
1453       *dst++ = '$';                                                     \
1454     if (! omit_inter)                                                   \
1455       *dst++ = (unsigned char) (inter[reg]);                            \
1456     *dst++ = fin;                                                       \
1457     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1458   } while (0)
1459
1460 /* Emit the ISO2022 single-shift function SS2 or SS3: the escape
1461    sequence in a 7-bit coding system, the one-byte control code
1462    otherwise.  Both mark CODING as being in a single shift.  */
1463
1464 #define ENCODE_SINGLE_SHIFT_2                           \
1465   do {                                                  \
1466     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
1467       {                                                 \
1468         *dst++ = ISO_CODE_SS2;                          \
1469         coding->fake_multibyte = 1;                     \
1470       }                                                 \
1471     else                                                \
1472       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1473     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1474   } while (0)
1475
1476 #define ENCODE_SINGLE_SHIFT_3                           \
1477   do {                                                  \
1478     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
1479       {                                                 \
1480         *dst++ = ISO_CODE_SS3;                          \
1481         coding->fake_multibyte = 1;                     \
1482       }                                                 \
1483     else                                                \
1484       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1485     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1486   } while (0)
1487
1488 /* The four macros below emit an ISO2022 locking-shift function
1489    (shift-in, shift-out, locking-shift-2, locking-shift-3) and record
1490    in CODING which graphic register is now invoked to graphic plane 0.  */
1491
1492 #define ENCODE_SHIFT_IN                         \
1493   do {                                          \
1494     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1495     *dst++ = ISO_CODE_SI;                       \
1496   } while (0)
1497
1498 #define ENCODE_SHIFT_OUT                        \
1499   do {                                          \
1500     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1501     *dst++ = ISO_CODE_SO;                       \
1502   } while (0)
1503
1504 #define ENCODE_LOCKING_SHIFT_2                  \
1505   do {                                          \
1506     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1507     *dst++ = ISO_CODE_ESC; *dst++ = 'n';        \
1508   } while (0)
1509
1510 #define ENCODE_LOCKING_SHIFT_3                  \
1511   do {                                          \
1512     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1513     *dst++ = ISO_CODE_ESC; *dst++ = 'o';        \
1514   } while (0)
1515
1516 /* Produce codes for a DIMENSION1 character whose character set is
1517    CHARSET and whose position-code is C1.  Designation and invocation
1518    sequences are also produced in advance if necessary.  The macro
1519    uses the variables `coding' and `dst' of the place of expansion.  */
1520
1521 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1522   do {                                                                  \
1523     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1524       { /* A single-shift is in effect for this character.  */          \
1525         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1526           *dst++ = c1 & 0x7F;                                           \
1527         else                                                            \
1528           *dst++ = c1 | 0x80;                                           \
1529         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1530         break;                                                          \
1531       }                                                                 \
1532     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1533       { /* CHARSET is invoked to graphic plane 0.  */                   \
1534         *dst++ = c1 & 0x7F;                                             \
1535         break;                                                          \
1536       }                                                                 \
1537     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1538       { /* CHARSET is invoked to graphic plane 1.  */                   \
1539         *dst++ = c1 | 0x80;                                             \
1540         break;                                                          \
1541       }                                                                 \
1542     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1543              && !coding->safe_charsets[charset])                        \
1544       {                                                                 \
1545         /* CHARSET is not a safe charset of CODING.  Produce the        \
1546            substitution code instead, twice if CHARSET has width 2.  */ \
1547         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1548         if (CHARSET_WIDTH (charset) == 2)                               \
1549           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1550         break;                                                          \
1551       }                                                                 \
1552     else                                                                \
1553       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1554          must invoke it, or, at first, designate it to some graphic     \
1555          register.  Then repeat the loop to actually produce the        \
1556          character.  */                                                 \
1557       dst = encode_invocation_designation (charset, coding, dst);       \
1558   } while (1)
1559
1560 /* Produce codes for a DIMENSION2 character whose character set is
1561    CHARSET and whose position-codes are C1 and C2.  Designation and
1562    invocation codes are also produced in advance if necessary.  */
1563
1564 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1565   do {                                                                  \
1566     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1567       { /* A single-shift is in effect for this character.  */          \
1568         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1569           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1570         else                                                            \
1571           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1572         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1573         break;                                                          \
1574       }                                                                 \
1575     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1576       { /* CHARSET is invoked to graphic plane 0.  */                   \
1577         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1578         break;                                                          \
1579       }                                                                 \
1580     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1581       { /* CHARSET is invoked to graphic plane 1.  */                   \
1582         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1583         break;                                                          \
1584       }                                                                 \
1585     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1586              && !coding->safe_charsets[charset])                        \
1587       {                                                                 \
1588         /* CHARSET is not a safe charset of CODING.  Produce the        \
1589            substitution code instead, twice if CHARSET has width 2.  */ \
1590         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1591         if (CHARSET_WIDTH (charset) == 2)                               \
1592           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1593         break;                                                          \
1594       }                                                                 \
1595     else                                                                \
1596       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1597          must invoke it, or, at first, designate it to some graphic     \
1598          register.  Then repeat the loop to actually produce the        \
1599          character.  */                                                 \
1600       dst = encode_invocation_designation (charset, coding, dst);       \
1601   } while (1)
1602 /* Encode one character given by CHARSET, C1 and C2 (after translation).  */
1603 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                   \
1604   do { /* Uses `translation_table', `coding' and `dst'.  */     \
1605     int c_alt, charset_alt;  /* translated char, its charset */ \
1606     if (!NILP (translation_table)                               \
1607         && ((c_alt = translate_char (translation_table, -1,     \
1608                                      charset, c1, c2))          \
1609             >= 0))                                              \
1610       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
1611     else                                                        \
1612       charset_alt = charset;                                    \
1613     if (CHARSET_DIMENSION (charset_alt) == 1)                   \
1614       { /* USE_ROMAN: ASCII goes out as latin-jisx0201.  */     \
1615         if (charset == CHARSET_ASCII                            \
1616             && coding->flags & CODING_FLAG_ISO_USE_ROMAN)       \
1617           charset_alt = charset_latin_jisx0201;                 \
1618         ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);      \
1619       }                                                         \
1620     else                                                        \
1621       { /* USE_OLDJIS: jisx0208 goes out as jisx0208-1978.  */  \
1622         if (charset == charset_jisx0208                         \
1623             && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)      \
1624           charset_alt = charset_jisx0208_1978;                  \
1625         ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);  \
1626       }                                                         \
1627     if (! COMPOSING_P (coding->composing))                      \
1628       coding->consumed_char++; /* components are not counted */ \
1629   } while (0)
1630
1631 /* Write at DST whatever designation and invocation codes are needed
1632    before a character of CHARSET can be encoded, and update the element
1633    `spec.iso2022' of *CODING accordingly.  Return the new DST.  */
1634
1635 unsigned char *
1636 encode_invocation_designation (charset, coding, dst)
1637      int charset;
1638      struct coding_system *coding;
1639      unsigned char *dst;
1640 {
1641   /* Graphic register number.  */
1642   int reg = 0;
1643   /* Non-zero if graphic registers 2 and 3 are to be invoked by a
1644      single-shift rather than by a locking-shift.  */
1645   int single_shift = (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) != 0;
1646
1647   /* Look for a graphic register CHARSET is already designated to.  */
1648   while (reg < 4 && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
1649     reg++;
1650
1651   if (reg == 4)
1652     {
1653       /* There is none.  Designate CHARSET to the register it
1654          requests, or to graphic register 0 if it requests no
1655          special designation.  */
1656       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1657       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1658         reg = 0;
1659
1660       ENCODE_DESIGNATION (charset, reg, coding);
1661     }
1662
1663   /* If REG is already invoked to graphic plane 0 or 1, we are done.  */
1664   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
1665       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
1666     return dst;
1667
1668   /* Otherwise produce the shift function that makes REG usable:
1669      shift-in or shift-out for graphic registers 0 and 1, and a
1670      single-shift or locking-shift for graphic registers 2 and 3.  */
1671   if (reg == 0)
1672     ENCODE_SHIFT_IN;
1673   else if (reg == 1)
1674     ENCODE_SHIFT_OUT;
1675   else if (reg == 2)
1676     {
1677       if (single_shift)
1678         ENCODE_SINGLE_SHIFT_2;
1679       else
1680         ENCODE_LOCKING_SHIFT_2;
1681     }
1682   else if (reg == 3)
1683     {
1684       if (single_shift)
1685         ENCODE_SINGLE_SHIFT_3;
1686       else
1687         ENCODE_LOCKING_SHIFT_3;
1688     }
1689   /* Any other value of REG produces no shift function.  */
1690
1691   return dst;
1692 }
1693
1694 /* These three macros emit at DST the codes that start or end a composition.  */
1695 #define ENCODE_COMPOSITION_NO_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
1696 #define ENCODE_COMPOSITION_WITH_RULE_START (*dst++ = ISO_CODE_ESC, *dst++ = '2')
1697 #define ENCODE_COMPOSITION_END    (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1698
1699 /* The following three macros produce codes for indicating direction
1700    of text.  They use DST and CODING of the expansion site.  The
1701    introducer is ESC '[' in a 7-bit environment and the single byte
1702    CSI otherwise.  It is an expression, not a statement, so that the
1703    direction macros can chain it with the comma operator.  */
1704 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1705   ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)         \
1706    ? (*dst++ = ISO_CODE_ESC, *dst++ = '[')              \
1707    : (*dst++ = ISO_CODE_CSI))
1708
1709 #define ENCODE_DIRECTION_R2L    \
1710   (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']')
1711
1712 #define ENCODE_DIRECTION_L2R    \
1713   (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']')
1714
1715 /* Bring the encoder back to its initial state: invoke graphic register 0
1716    to graphic plane 0, then redo every initial designation that differs.  */
1717 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1718   do {                                                                      \
1719     int g, init;                                                            \
1720     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1721       ENCODE_SHIFT_IN;                                                      \
1722     for (g = 0; g < 4; g++)                                                 \
1723       {                                                                     \
1724         init = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, g);             \
1725         if (init >= 0 && CODING_SPEC_ISO_DESIGNATION (coding, g) != init)   \
1726           ENCODE_DESIGNATION (init, g, coding);                             \
1727       }                                                                     \
1728   } while (0)
1729
1730 /* Scan the line starting at SRC (stopping at SRC_END at the latest) for
1731    charsets that request a graphic register, and produce designation
1732    sequences for them at the place pointed to by *DSTP, updating *DSTP.
1733    If TABLE is non-nil, characters are translated by it first.  If the
1734    block ends before any end-of-line, some designations may be missed.  */
1735
1736 void
1737 encode_designation_at_bol (coding, table, src, src_end, dstp)
1738      struct coding_system *coding;
1739      Lisp_Object table;
1740      unsigned char *src, *src_end, **dstp;
1741 {
1742   int charset, c, reg;
1743   /* WANTED[REG] is the first charset found that requests graphic
1744      register REG, or -1 if there is none.  */
1745   int wanted[4], n_wanted = 0;
1746   /* ENCODE_DESIGNATION stores through the variable named `dst'.  */
1747   unsigned char *dst = *dstp;
1748
1749   for (reg = 3; reg >= 0; reg--)
1750     wanted[reg] = -1;
1751
1752   while (n_wanted < 4 && src < src_end && *src != '\n')
1753     {
1754       int bytes = BYTES_BY_CHAR_HEAD (*src);
1755       if (! NILP (table))
1756         {
1757           unsigned char c1, c2;
1758           int translated;
1759           SPLIT_STRING(src, bytes, charset, c1, c2);
1760           translated = translate_char (table, -1, charset, c1, c2);
1761           if (translated >= 0)
1762             charset = CHAR_CHARSET (translated);
1763         }
1764       else
1765         charset = CHARSET_AT (src);
1766
1767       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1768       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && wanted[reg] < 0)
1769         {
1770           wanted[reg] = charset;
1771           n_wanted++;
1772         }
1773       src += bytes;
1774     }
1775
1776   if (n_wanted == 0)
1777     return;
1778
1779   for (reg = 0; reg < 4; reg++)
1780     if (wanted[reg] >= 0
1781         && CODING_SPEC_ISO_DESIGNATION (coding, reg) != wanted[reg])
1782       ENCODE_DESIGNATION (wanted[reg], reg, coding);
1783   *dstp = dst;
1784 }
1785
1786 /* Encode SOURCE into ISO2022 at DESTINATION; return CODING_FINISH_XXX.  See
1787    the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1788 int
1789 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1790      struct coding_system *coding;
1791      unsigned char *source, *destination;
1792      int src_bytes, dst_bytes;
1793 {
1794   unsigned char *src = source;
1795   unsigned char *src_end = source + src_bytes;
1796   unsigned char *dst = destination;
1797   unsigned char *dst_end = destination + dst_bytes;
1798   /* Since at most 20 bytes are produced by each loop, we subtract 19
1799      from DST_END so that overflow checking is necessary only at the
1800      head of the loop.  */
1801   unsigned char *adjusted_dst_end = dst_end - 19;
1802   Lisp_Object translation_table
1803       = coding->translation_table_for_encode;
1804   int result = CODING_FINISH_NORMAL;
1805
1806   if (!NILP (Venable_character_translation) && NILP (translation_table))
1807     translation_table = Vstandard_translation_table_for_encode;
1808
1809   coding->consumed_char = 0;
1810   coding->fake_multibyte = 0;
1811   while (src < src_end && (dst_bytes
1812                            ? (dst < adjusted_dst_end)
1813                            : (dst < src - 19)))
1814     {
1815       /* SRC_BASE remembers the start position in source in each loop.
1816          The loop will be exited when there's not enough source text
1817          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1818          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1819          reset to SRC_BASE before exiting.  */
1820       unsigned char *src_base = src;
1821       int charset, c1, c2 = 0, c3 = 0, c4 = 0;  /* dummies are read too */
1822
1823       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1824           && CODING_SPEC_ISO_BOL (coding))
1825         {
1826           /* We have to produce designation sequences if any now.  */
1827           encode_designation_at_bol (coding, translation_table,
1828                                      src, src_end, &dst);
1829           CODING_SPEC_ISO_BOL (coding) = 0;
1830         }
1831
1832       c1 = *src++;
1833       /* If we are seeing a component of a composite character, we are
1834          seeing a leading-code encoded irregularly for composition, or
1835          a composition rule if composing with rule.  We must set C1 to
1836          a normal leading-code or an ASCII code.  If we are not seeing
1837          a composite character, we must reset composition,
1838          designation, and invocation states.  */
1839       if (COMPOSING_P (coding->composing))
1840         {
1841           if (c1 < 0xA0)
1842             {
1843               /* We are not in a composite character any longer.  */
1844               coding->composing = COMPOSING_NO;
1845               ENCODE_RESET_PLANE_AND_REGISTER;
1846               ENCODE_COMPOSITION_END;
1847             }
1848           else
1849             {
1850               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1851                 {
1852                   *dst++ = c1 & 0x7F;
1853                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1854                   continue;
1855                 }
1856               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1857                 coding->composing = COMPOSING_WITH_RULE_RULE;
1858               if (c1 == 0xA0)
1859                 {
1860                   /* This is an ASCII component.  */
1861                   ONE_MORE_BYTE (c1);
1862                   c1 &= 0x7F;
1863                 }
1864               else
1865                 /* This is a leading-code of non ASCII component.  */
1866                 c1 -= 0x20;
1867             }
1868         }
1869
1870       /* Now encode one character.  C1 is a control character, an
1871          ASCII character, or a leading-code of multi-byte character.  */
1872       switch (emacs_code_class[c1])
1873         {
1874         case EMACS_ascii_code:
1875           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1876           break;
1877
1878         case EMACS_control_code:
1879           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1880             ENCODE_RESET_PLANE_AND_REGISTER;
1881           *dst++ = c1;
1882           coding->consumed_char++;
1883           break;
1884
1885         case EMACS_carriage_return_code:
1886           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1887             {
1888               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1889                 ENCODE_RESET_PLANE_AND_REGISTER;
1890               *dst++ = c1;
1891               coding->consumed_char++;
1892               break;
1893             }
1894           /* fall down to treat '\r' as '\n' ...  */
1895
1896         case EMACS_linefeed_code:
1897           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1898             ENCODE_RESET_PLANE_AND_REGISTER;
1899           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1900             bcopy (coding->spec.iso2022.initial_designation,
1901                    coding->spec.iso2022.current_designation,
1902                    sizeof coding->spec.iso2022.initial_designation);
1903           if (coding->eol_type == CODING_EOL_LF
1904               || coding->eol_type == CODING_EOL_UNDECIDED)
1905             *dst++ = ISO_CODE_LF;
1906           else if (coding->eol_type == CODING_EOL_CRLF)
1907             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1908           else
1909             *dst++ = ISO_CODE_CR;
1910           CODING_SPEC_ISO_BOL (coding) = 1;
1911           coding->consumed_char++;
1912           break;
1913
1914         case EMACS_leading_code_2:
1915           ONE_MORE_BYTE (c2);
1916           if (c2 < 0xA0)
1917             {
1918               /* invalid sequence */
1919               *dst++ = c1;
1920               src--;
1921               coding->consumed_char++;
1922             }
1923           else
1924             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1925           break;
1926
1927         case EMACS_leading_code_3:
1928           TWO_MORE_BYTES (c2, c3);
1929           if (c2 < 0xA0 || c3 < 0xA0)
1930             {
1931               /* invalid sequence */
1932               *dst++ = c1;
1933               src -= 2;
1934               coding->consumed_char++;
1935             }
1936           else if (c1 < LEADING_CODE_PRIVATE_11)
1937             ENCODE_ISO_CHARACTER (c1, c2, c3);
1938           else
1939             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1940           break;
1941
1942         case EMACS_leading_code_4:
1943           THREE_MORE_BYTES (c2, c3, c4);
1944           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1945             {
1946               /* invalid sequence */
1947               *dst++ = c1;
1948               src -= 3;
1949               coding->consumed_char++;
1950             }
1951           else
1952             ENCODE_ISO_CHARACTER (c2, c3, c4);
1953           break;
1954
1955         case EMACS_leading_code_composition:
1956           ONE_MORE_BYTE (c2);
1957           if (c2 < 0xA0)
1958             {
1959               /* invalid sequence */
1960               *dst++ = c1;
1961               src--;
1962               coding->consumed_char++;
1963             }
1964           else if (c2 == 0xFF)
1965             {
1966               ENCODE_RESET_PLANE_AND_REGISTER;
1967               coding->composing = COMPOSING_WITH_RULE_HEAD;
1968               ENCODE_COMPOSITION_WITH_RULE_START;
1969               coding->consumed_char++;
1970             }
1971           else
1972             {
1973               ENCODE_RESET_PLANE_AND_REGISTER;
1974               /* Rewind one byte because it is a character code of
1975                  composition elements.  */
1976               src--;
1977               coding->composing = COMPOSING_NO_RULE_HEAD;
1978               ENCODE_COMPOSITION_NO_RULE_START;
1979               coding->consumed_char++;
1980             }
1981           break;
1982
1983         case EMACS_invalid_code:
1984           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1985             ENCODE_RESET_PLANE_AND_REGISTER;
1986           *dst++ = c1;
1987           coding->consumed_char++;
1988           break;
1989         }
1990       continue;
1991     label_end_of_loop:          /* presumably reached from *_MORE_BYTE(S) */
1992       result = CODING_FINISH_INSUFFICIENT_SRC;
1993       src = src_base;
1994       break;
1995     }
1996
1997   if (src < src_end && result == CODING_FINISH_NORMAL)
1998     result = CODING_FINISH_INSUFFICIENT_DST;  /* the loop ran out of room */
1999
2000   /* If this is the last block of the text to be encoded, we must
2001      reset graphic planes and registers to the initial state, and
2002      flush out the carryover if any.  */
2003   if (coding->mode & CODING_MODE_LAST_BLOCK)
2004     {
2005       ENCODE_RESET_PLANE_AND_REGISTER;
2006       if (COMPOSING_P (coding->composing))
2007         ENCODE_COMPOSITION_END;
2008       if (result == CODING_FINISH_INSUFFICIENT_SRC)
2009         {
2010           while (src < src_end && dst < dst_end)
2011             *dst++ = *src++;
2012         }
2013     }
2014   coding->consumed = src - source;
2015   coding->produced = coding->produced_char = dst - destination;
2016   return result;
2017 }
2018
2019 \f
2020 /*** 4. SJIS and BIG5 handlers ***/
2021
2022 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2023    quite widely.  So, for the moment, Emacs supports them in bare
2024    C code.  But, in the future, they may be supported only by CCL.  */
2025
2026 /* SJIS is a coding system encoding three character sets: ASCII, right
2027    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2028    as is.  A character of charset katakana-jisx0201 is encoded as
2029    "position-code + 0x80".  A character of charset japanese-jisx0208
2030    is encoded in two bytes, with its two position-codes divided and
2031    shifted so that they fit in the ranges below.
2032
2033    --- CODE RANGE of SJIS ---
2034    (character set)      (range)
2035    ASCII                0x00 .. 0x7F
2036    KATAKANA-JISX0201    0xA0 .. 0xDF
2037    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2038             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2039    -------------------------------
2040
2041 */
2042
2043 /* BIG5 is a coding system encoding two character sets: ASCII and
2044    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2045    character set and is encoded in two bytes.
2046
2047    --- CODE RANGE of BIG5 ---
2048    (character set)      (range)
2049    ASCII                0x00 .. 0x7F
2050    Big5 (1st byte)      0xA1 .. 0xFE
2051         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2052    --------------------------
2053
2054    Since the number of characters in Big5 is larger than the maximum
2055    number of characters in an Emacs charset (96x96), it can't be
2056    handled as one charset.  So, in Emacs, Big5 is divided into two:
2057    `charset-big5-1' and `charset-big5-2'.  Both are DIMENSION2 and
2058    CHARS94.  The former contains frequently used characters and the
2059    latter contains less frequently used characters.  */
2060
2061 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2062    are the 1st and 2nd position-codes in the BIG5 coding system, C1 and
2063    C2 those of Emacs' internal format, and CHARSET is `charset_big5_1'
2064    or `charset_big5_2'.  Arguments may be evaluated more than once.  */
2065
2066 /* Number of Big5 characters which have the same code in 1st byte.  */
2067 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2068
2069 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2070   do {                                                                  \
2071     unsigned int temp = (((b1) - 0xA1) * BIG5_SAME_ROW                  \
2072                          + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));         \
2073     if ((b1) < 0xC9)                                                    \
2074       (charset) = charset_big5_1;                                       \
2075     else                                                                \
2076       {                                                                 \
2077         (charset) = charset_big5_2;                                     \
2078         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
2079       }                                                                 \
2080     (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
2081     (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
2082   } while (0)
2083
2084 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2085   do {                                                                  \
2086     unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
2087     if ((charset) == charset_big5_2)                                    \
2088       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2089     (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
2090     (b2) = temp % BIG5_SAME_ROW;                                        \
2091     (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
2092   } while (0)
2093
/* Produce the decoded character (CHARSET, C1, C2).  If the variable
   `translation_table' of the enclosing function is non-nil and
   translate_char gives a non-negative result, that result replaces
   the character (C1 and C2 are overwritten, so they must be lvalues).
   The character is then emitted by the DECODE_CHARACTER_XXX macro
   matching its charset; ASCII and negative charsets take the ASCII
   path.  CHARSET is evaluated more than once.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int translated = -1, target_charset = (charset);                    \
    if (!NILP (translation_table))                                      \
      translated = translate_char (translation_table,                   \
                                   -1, (charset), c1, c2);              \
    if (translated >= 0)                                                \
      {                                                                 \
        SPLIT_CHAR (translated, target_charset, c1, c2);                \
      }                                                                 \
    if (target_charset < 0 || target_charset == CHARSET_ASCII)          \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (target_charset) != 1)                   \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (target_charset, c1, c2);           \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (target_charset, c1);               \
      }                                                                 \
  } while (0)
2108
/* Encode the character (CHARSET, C1, C2) of Emacs' internal format,
   store the result at `dst' and count one more consumed character in
   `coding->consumed_char'.  The expansion refers to the variables
   `dst', `coding', `sjis_p' and `translation_table' of the enclosing
   function.  C1 and C2 must be lvalues; they are overwritten when the
   character is translated and are masked with 0x7F for
   two-dimensional charsets.

   ASCII is written as is.  With SJIS_P nonzero, JISX0201-Kana is
   written as one byte and JISX0208 as two SJIS bytes; with SJIS_P
   zero, Big5 characters are written as two BIG5 bytes.  Any other
   charset is written unencoded as CHARSET followed by its
   position-codes (up to three bytes), and `coding->fake_multibyte' is
   set.

   The expansion is a single statement WITHOUT a trailing semicolon;
   the caller supplies it, so the macro is safe in an unbraced
   `if ... else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)             \
  do {                                                          \
    int c_alt, charset_alt;                                     \
    if (!NILP (translation_table)                               \
        && ((c_alt = translate_char (translation_table, -1,     \
                                     (charset), c1, c2))        \
            >= 0))                                              \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
    else                                                        \
      charset_alt = (charset);                                  \
    if (charset_alt == charset_ascii)                           \
      *dst++ = c1;                                              \
    else if (CHARSET_DIMENSION (charset_alt) == 1)              \
      {                                                         \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1;                                          \
        else                                                    \
          {                                                     \
            /* Not encodable: emit the raw internal form.  */   \
            *dst++ = charset_alt, *dst++ = c1;                  \
            coding->fake_multibyte = 1;                         \
          }                                                     \
      }                                                         \
    else                                                        \
      {                                                         \
        c1 &= 0x7F, c2 &= 0x7F;                                 \
        if (sjis_p && charset_alt == charset_jisx0208)          \
          {                                                     \
            unsigned char s1, s2;                               \
                                                                \
            ENCODE_SJIS (c1, c2, s1, s2);                       \
            *dst++ = s1, *dst++ = s2;                           \
            coding->fake_multibyte = 1;                         \
          }                                                     \
        else if (!sjis_p                                        \
                 && (charset_alt == charset_big5_1              \
                     || charset_alt == charset_big5_2))         \
          {                                                     \
            unsigned char b1, b2;                               \
                                                                \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);          \
            *dst++ = b1, *dst++ = b2;                           \
          }                                                     \
        else                                                    \
          {                                                     \
            /* Not encodable: emit the raw internal form.  */   \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;     \
            coding->fake_multibyte = 1;                         \
          }                                                     \
      }                                                         \
    coding->consumed_char++;                                    \
  } while (0)
2159
2160 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2161    Check if a text is encoded in SJIS.  If it is, return
2162    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2163
2164 int
2165 detect_coding_sjis (src, src_end)
2166      unsigned char *src, *src_end;
2167 {
2168   unsigned char c;
2169
2170   while (src < src_end)
2171     {
2172       c = *src++;
2173       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2174         {
2175           if (src < src_end && *src++ < 0x40)
2176             return 0;
2177         }
2178     }
2179   return CODING_CATEGORY_MASK_SJIS;
2180 }
2181
2182 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2183    Check if a text is encoded in BIG5.  If it is, return
2184    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2185
2186 int
2187 detect_coding_big5 (src, src_end)
2188      unsigned char *src, *src_end;
2189 {
2190   unsigned char c;
2191
2192   while (src < src_end)
2193     {
2194       c = *src++;
2195       if (c >= 0xA1)
2196         {
2197           if (src >= src_end)
2198             break;
2199           c = *src++;
2200           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2201             return 0;
2202         }
2203     }
2204   return CODING_CATEGORY_MASK_BIG5;
2205 }
2206
2207 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2208    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2209
2210 int
2211 decode_coding_sjis_big5 (coding, source, destination,
2212                          src_bytes, dst_bytes, sjis_p)
2213      struct coding_system *coding;
2214      unsigned char *source, *destination;
2215      int src_bytes, dst_bytes;
2216      int sjis_p;
2217 {
2218   unsigned char *src = source;
2219   unsigned char *src_end = source + src_bytes;
2220   unsigned char *dst = destination;
2221   unsigned char *dst_end = destination + dst_bytes;
2222   /* Since the maximum bytes produced by each loop is 4, we subtract 3
2223      from DST_END to assure overflow checking is necessary only at the
2224      head of loop.  */
2225   unsigned char *adjusted_dst_end = dst_end - 3;
2226   Lisp_Object translation_table
2227       = coding->translation_table_for_decode;
2228   int result = CODING_FINISH_NORMAL;
2229
2230   if (!NILP (Venable_character_translation) && NILP (translation_table))
2231     translation_table = Vstandard_translation_table_for_decode;
2232
2233   coding->produced_char = 0;
2234   coding->fake_multibyte = 0;
2235   while (src < src_end && (dst_bytes
2236                            ? (dst < adjusted_dst_end)
2237                            : (dst < src - 3)))
2238     {
2239       /* SRC_BASE remembers the start position in source in each loop.
2240          The loop will be exited when there's not enough source text
2241          to analyze two-byte character (within macro ONE_MORE_BYTE).
2242          In that case, SRC is reset to SRC_BASE before exiting.  */
2243       unsigned char *src_base = src;
2244       unsigned char c1 = *src++, c2, c3, c4;
2245
2246       if (c1 < 0x20)
2247         {
2248           if (c1 == '\r')
2249             {
2250               if (coding->eol_type == CODING_EOL_CRLF)
2251                 {
2252                   ONE_MORE_BYTE (c2);
2253                   if (c2 == '\n')
2254                     *dst++ = c2;
2255                   else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2256                     {
2257                       result = CODING_FINISH_INCONSISTENT_EOL;
2258                       goto label_end_of_loop_2;
2259                     }
2260                   else
2261                     /* To process C2 again, SRC is subtracted by 1.  */
2262                     *dst++ = c1, src--;
2263                 }
2264               else if (coding->eol_type == CODING_EOL_CR)
2265                 *dst++ = '\n';
2266               else
2267                 *dst++ = c1;
2268             }
2269           else if (c1 == '\n'
2270                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2271                    && (coding->eol_type == CODING_EOL_CR
2272                        || coding->eol_type == CODING_EOL_CRLF))
2273             {
2274               result = CODING_FINISH_INCONSISTENT_EOL;
2275               goto label_end_of_loop_2;
2276             }
2277           else
2278             *dst++ = c1;
2279           coding->produced_char++;
2280         }
2281       else if (c1 < 0x80)
2282         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2283       else
2284         {
2285           if (sjis_p)
2286             {
2287               if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2288                 {
2289                   /* SJIS -> JISX0208 */
2290                   ONE_MORE_BYTE (c2);
2291                   if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2292                     {
2293                       DECODE_SJIS (c1, c2, c3, c4);
2294                       DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2295                     }
2296                   else
2297                     goto label_invalid_code_2;
2298                 }
2299               else if (c1 < 0xE0)
2300                 /* SJIS -> JISX0201-Kana */
2301                 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2302                                             /* dummy */ c2);
2303               else
2304                 goto label_invalid_code_1;
2305             }
2306           else
2307             {
2308               /* BIG5 -> Big5 */
2309               if (c1 >= 0xA1 && c1 <= 0xFE)
2310                 {
2311                   ONE_MORE_BYTE (c2);
2312                   if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2313                     {
2314                       int charset;
2315
2316                       DECODE_BIG5 (c1, c2, charset, c3, c4);
2317                       DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2318                     }
2319                   else
2320                     goto label_invalid_code_2;
2321                 }
2322               else
2323                 goto label_invalid_code_1;
2324             }
2325         }
2326       continue;
2327
2328     label_invalid_code_1:
2329       *dst++ = c1;
2330       coding->produced_char++;
2331       coding->fake_multibyte = 1;
2332       continue;
2333
2334     label_invalid_code_2:
2335       *dst++ = c1; *dst++= c2;
2336       coding->produced_char += 2;
2337       coding->fake_multibyte = 1;
2338       continue;
2339
2340     label_end_of_loop:
2341       result = CODING_FINISH_INSUFFICIENT_SRC;
2342     label_end_of_loop_2:
2343       src = src_base;
2344       break;
2345     }
2346
2347   if (src < src_end)
2348     {
2349       if (result == CODING_FINISH_NORMAL)
2350         result = CODING_FINISH_INSUFFICIENT_DST;
2351       else if (result != CODING_FINISH_INCONSISTENT_EOL
2352                && coding->mode & CODING_MODE_LAST_BLOCK)
2353         {
2354           src_bytes = src_end - src;
2355           if (dst_bytes && (dst_end - dst < src_bytes))
2356             src_bytes = dst_end - dst;
2357           bcopy (dst, src, src_bytes);
2358           src += src_bytes;
2359           dst += src_bytes;
2360           coding->fake_multibyte = 1;
2361         }
2362     }
2363
2364   coding->consumed = coding->consumed_char = src - source;
2365   coding->produced = dst - destination;
2366   return result;
2367 }
2368
2369 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2370    This function can encode `charset_ascii', `charset_katakana_jisx0201',
2371    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
2372    sure that all these charsets are registered as official charset
2373    (i.e. do not have extended leading-codes).  Characters of other
2374    charsets are produced without any encoding.  If SJIS_P is 1, encode
2375    SJIS text, else encode BIG5 text.  */
2376
2377 int
2378 encode_coding_sjis_big5 (coding, source, destination,
2379                          src_bytes, dst_bytes, sjis_p)
2380      struct coding_system *coding;
2381      unsigned char *source, *destination;
2382      int src_bytes, dst_bytes;
2383      int sjis_p;
2384 {
2385   unsigned char *src = source;
2386   unsigned char *src_end = source + src_bytes;
2387   unsigned char *dst = destination;
2388   unsigned char *dst_end = destination + dst_bytes;
2389   /* Since the maximum bytes produced by each loop is 2, we subtract 1
2390      from DST_END to assure overflow checking is necessary only at the
2391      head of loop.  */
2392   unsigned char *adjusted_dst_end = dst_end - 1;
2393   Lisp_Object translation_table
2394       = coding->translation_table_for_encode;
2395   int result = CODING_FINISH_NORMAL;
2396
2397   if (!NILP (Venable_character_translation) && NILP (translation_table))
2398     translation_table = Vstandard_translation_table_for_encode;
2399
2400   coding->consumed_char = 0;
2401   coding->fake_multibyte = 0;
2402   while (src < src_end && (dst_bytes
2403                            ? (dst < adjusted_dst_end)
2404                            : (dst < src - 1)))
2405     {
2406       /* SRC_BASE remembers the start position in source in each loop.
2407          The loop will be exited when there's not enough source text
2408          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2409          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2410          before exiting.  */
2411       unsigned char *src_base = src;
2412       unsigned char c1 = *src++, c2, c3, c4;
2413
2414       if (coding->composing)
2415         {
2416           if (c1 == 0xA0)
2417             {
2418               ONE_MORE_BYTE (c1);
2419               c1 &= 0x7F;
2420             }
2421           else if (c1 >= 0xA0)
2422             c1 -= 0x20;
2423           else
2424             coding->composing = 0;
2425         }
2426
2427       switch (emacs_code_class[c1])
2428         {
2429         case EMACS_ascii_code:
2430           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2431           break;
2432
2433         case EMACS_control_code:
2434           *dst++ = c1;
2435           coding->consumed_char++;
2436           break;
2437
2438         case EMACS_carriage_return_code:
2439           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2440             {
2441               *dst++ = c1;
2442               coding->consumed_char++;
2443               break;
2444             }
2445           /* fall down to treat '\r' as '\n' ...  */
2446
2447         case EMACS_linefeed_code:
2448           if (coding->eol_type == CODING_EOL_LF
2449               || coding->eol_type == CODING_EOL_UNDECIDED)
2450             *dst++ = '\n';
2451           else if (coding->eol_type == CODING_EOL_CRLF)
2452             *dst++ = '\r', *dst++ = '\n';
2453           else
2454             *dst++ = '\r';
2455           coding->consumed_char++;
2456           break;
2457
2458         case EMACS_leading_code_2:
2459           ONE_MORE_BYTE (c2);
2460           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2461           break;
2462
2463         case EMACS_leading_code_3:
2464           TWO_MORE_BYTES (c2, c3);
2465           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2466           break;
2467
2468         case EMACS_leading_code_4:
2469           THREE_MORE_BYTES (c2, c3, c4);
2470           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2471           break;
2472
2473         case EMACS_leading_code_composition:
2474           coding->composing = 1;
2475           break;
2476
2477         default:                /* i.e. case EMACS_invalid_code: */
2478           *dst++ = c1;
2479           coding->consumed_char++;
2480         }
2481       continue;
2482
2483     label_end_of_loop:
2484       result = CODING_FINISH_INSUFFICIENT_SRC;
2485       src = src_base;
2486       break;
2487     }
2488
2489   if (result == CODING_FINISH_NORMAL
2490       && src < src_end)
2491     result = CODING_FINISH_INSUFFICIENT_DST;
2492   coding->consumed = src - source;
2493   coding->produced = coding->produced_char = dst - destination;
2494   return result;
2495 }
2496
2497 \f
2498 /*** 5. CCL handlers ***/
2499
2500 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2501    Check if a text is encoded in a coding system of which
2502    encoder/decoder are written in CCL program.  If it is, return
2503    CODING_CATEGORY_MASK_CCL, else return 0.  */
2504
2505 int
2506 detect_coding_ccl (src, src_end)
2507      unsigned char *src, *src_end;
2508 {
2509   unsigned char *valid;
2510
2511   /* No coding system is assigned to coding-category-ccl.  */
2512   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2513     return 0;
2514
2515   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2516   while (src < src_end)
2517     {
2518       if (! valid[*src]) return 0;
2519       src++;
2520     }
2521   return CODING_CATEGORY_MASK_CCL;
2522 }
2523
2524 \f
2525 /*** 6. End-of-line handlers ***/
2526
2527 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2528    This function is called only when `coding->eol_type' is
2529    CODING_EOL_CRLF or CODING_EOL_CR.  */
2530
2531 int
2532 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2533      struct coding_system *coding;
2534      unsigned char *source, *destination;
2535      int src_bytes, dst_bytes;
2536 {
2537   unsigned char *src = source;
2538   unsigned char *src_end = source + src_bytes;
2539   unsigned char *dst = destination;
2540   unsigned char *dst_end = destination + dst_bytes;
2541   unsigned char c;
2542   int result = CODING_FINISH_NORMAL;
2543
2544   coding->fake_multibyte = 0;
2545
2546   if (src_bytes <= 0)
2547     return result;
2548
2549   switch (coding->eol_type)
2550     {
2551     case CODING_EOL_CRLF:
2552       {
2553         /* Since the maximum bytes produced by each loop is 2, we
2554            subtract 1 from DST_END to assure overflow checking is
2555            necessary only at the head of loop.  */
2556         unsigned char *adjusted_dst_end = dst_end - 1;
2557
2558         while (src < src_end && (dst_bytes
2559                                  ? (dst < adjusted_dst_end)
2560                                  : (dst < src - 1)))
2561           {
2562             unsigned char *src_base = src;
2563
2564             c = *src++;
2565             if (c == '\r')
2566               {
2567                 ONE_MORE_BYTE (c);
2568                 if (c == '\n')
2569                   *dst++ = c;
2570                 else
2571                   {
2572                     if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2573                       {
2574                         result = CODING_FINISH_INCONSISTENT_EOL;
2575                         goto label_end_of_loop_2;
2576                       }
2577                     src--;
2578                     *dst++ = '\r';
2579                     if (BASE_LEADING_CODE_P (c))
2580                       coding->fake_multibyte = 1;
2581                   }
2582               }
2583             else if (c == '\n'
2584                      && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2585               {
2586                 result = CODING_FINISH_INCONSISTENT_EOL;
2587                 goto label_end_of_loop_2;
2588               }
2589             else
2590               {
2591                 *dst++ = c;
2592                 if (BASE_LEADING_CODE_P (c))
2593                   coding->fake_multibyte = 1;
2594               }
2595             continue;
2596
2597           label_end_of_loop:
2598             result = CODING_FINISH_INSUFFICIENT_SRC;
2599           label_end_of_loop_2:
2600             src = src_base;
2601             break;
2602           }
2603         if (src < src_end)
2604           {
2605             if (result == CODING_FINISH_NORMAL)
2606               result = CODING_FINISH_INSUFFICIENT_DST;
2607             else if (result != CODING_FINISH_INCONSISTENT_EOL
2608                      && coding->mode & CODING_MODE_LAST_BLOCK)
2609               {
2610                 /* This is the last block of the text to be decoded.
2611                    We flush out all remaining codes.  */
2612                 src_bytes = src_end - src;
2613                 if (dst_bytes && (dst_end - dst < src_bytes))
2614                   src_bytes = dst_end - dst;
2615                 bcopy (src, dst, src_bytes);
2616                 dst += src_bytes;
2617                 src += src_bytes;
2618               }
2619           }
2620       }
2621       break;
2622
2623     case CODING_EOL_CR:
2624       if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2625         {
2626           while (src < src_end)
2627             {
2628               if ((c = *src++) == '\n')
2629                 break;
2630               if (BASE_LEADING_CODE_P (c))
2631                 coding->fake_multibyte = 1;
2632             }
2633           if (*--src == '\n')
2634             {
2635               src_bytes = src - source;
2636               result = CODING_FINISH_INCONSISTENT_EOL;
2637             }
2638         }
2639       if (dst_bytes && src_bytes > dst_bytes)
2640         {
2641           result = CODING_FINISH_INSUFFICIENT_DST;
2642           src_bytes = dst_bytes;
2643         }
2644       if (dst_bytes)
2645         bcopy (source, destination, src_bytes);
2646       else
2647         safe_bcopy (source, destination, src_bytes);
2648       src = source + src_bytes;
2649       while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2650       break;
2651
2652     default:                    /* i.e. case: CODING_EOL_LF */
2653       if (dst_bytes && src_bytes > dst_bytes)
2654         {
2655           result = CODING_FINISH_INSUFFICIENT_DST;
2656           src_bytes = dst_bytes;
2657         }
2658       if (dst_bytes)
2659         bcopy (source, destination, src_bytes);
2660       else
2661         safe_bcopy (source, destination, src_bytes);
2662       src += src_bytes;
2663       dst += src_bytes;
2664       coding->fake_multibyte = 1;
2665       break;
2666     }
2667
2668   coding->consumed = coding->consumed_char = src - source;
2669   coding->produced = coding->produced_char = dst - destination;
2670   return result;
2671 }
2672
2673 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2674    format of end-of-line according to `coding->eol_type'.  If
2675    `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2676    '\r' in source text also means end-of-line.  */
2677
2678 int
2679 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2680      struct coding_system *coding;
2681      unsigned char *source, *destination;
2682      int src_bytes, dst_bytes;
2683 {
2684   unsigned char *src = source;
2685   unsigned char *dst = destination;
2686   int result = CODING_FINISH_NORMAL;
2687
2688   coding->fake_multibyte = 0;
2689
2690   if (coding->eol_type == CODING_EOL_CRLF)
2691     {
2692       unsigned char c;
2693       unsigned char *src_end = source + src_bytes;
2694       unsigned char *dst_end = destination + dst_bytes;
2695       /* Since the maximum bytes produced by each loop is 2, we
2696          subtract 1 from DST_END to assure overflow checking is
2697          necessary only at the head of loop.  */
2698       unsigned char *adjusted_dst_end = dst_end - 1;
2699
2700       while (src < src_end && (dst_bytes
2701                                ? (dst < adjusted_dst_end)
2702                                : (dst < src - 1)))
2703         {
2704           c = *src++;
2705           if (c == '\n'
2706               || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2707             *dst++ = '\r', *dst++ = '\n';
2708           else
2709             {
2710               *dst++ = c;
2711               if (BASE_LEADING_CODE_P (c))
2712                 coding->fake_multibyte = 1;
2713             }
2714         }
2715       if (src < src_end)
2716         result = CODING_FINISH_INSUFFICIENT_DST;
2717     }
2718   else
2719     {
2720       unsigned char c;
2721
2722       if (dst_bytes && src_bytes > dst_bytes)
2723         {
2724           src_bytes = dst_bytes;
2725           result = CODING_FINISH_INSUFFICIENT_DST;
2726         }
2727       if (dst_bytes)
2728         bcopy (source, destination, src_bytes);
2729       else
2730         safe_bcopy (source, destination, src_bytes);
2731       dst_bytes = src_bytes;
2732       if (coding->eol_type == CODING_EOL_CR)
2733         {
2734           while (src_bytes--)
2735             {
2736               if ((c = *dst++) == '\n')
2737                 dst[-1] = '\r';
2738               else if (BASE_LEADING_CODE_P (c))
2739                 coding->fake_multibyte = 1;
2740             }
2741         }
2742       else
2743         {
2744           if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2745             {
2746               while (src_bytes--)
2747                 if (*dst++ == '\r') dst[-1] = '\n';
2748             }
2749           coding->fake_multibyte = 1;
2750         }
2751       src = source + dst_bytes;
2752       dst = destination + dst_bytes;
2753     }
2754
2755   coding->consumed = coding->consumed_char = src - source;
2756   coding->produced = coding->produced_char = dst - destination;
2757   return result;
2758 }
2759
2760 \f
2761 /*** 7. C library functions ***/
2762
2763 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2764    has a property `coding-system'.  The value of this property is a
2765    vector of length 5 (called the coding-vector).  Among elements of
2766    this vector, the first (element[0]) and the fifth (element[4])
2767    carry important information for decoding/encoding.  Before
2768    decoding/encoding, this information should be set in fields of a
2769    structure of type `coding_system'.
2770
2771    A value of property `coding-system' can be a symbol of another
2772    subsidiary coding-system.  In that case, Emacs gets coding-vector
2773    from that symbol.
2774
2775    `element[0]' contains information to be set in `coding->type'.  The
2776    value and its meaning is as follows:
2777
2778    0 -- coding_type_emacs_mule
2779    1 -- coding_type_sjis
2780    2 -- coding_type_iso2022
2781    3 -- coding_type_big5
2782    4 -- coding_type_ccl encoder/decoder written in CCL
2783    nil -- coding_type_no_conversion
2784    t -- coding_type_undecided (automatic conversion on decoding,
2785                                no-conversion on encoding)
2786
2787    `element[4]' contains information to be set in `coding->flags' and
2788    `coding->spec'.  The meaning varies by `coding->type'.
2789
2790    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2791    of length 32 (of which the first 13 sub-elements are used now).
2792    Meanings of these sub-elements are:
2793
2794    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2795         If the value is an integer of valid charset, the charset is
2796         assumed to be designated to graphic register N initially.
2797
2798         If the value is negative, it is the negation of a charset which
2799         reserves graphic register N, which means that the charset is
2800         not designated initially but should be designated to graphic
2801         register N just before encoding a character in that charset.
2802
2803         If the value is nil, graphic register N is never used on
2804         encoding.
2805
2806    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2807         Each value takes t or nil.  See the section ISO2022 of
2808         `coding.h' for more information.
2809
2810    If `coding->type' is `coding_type_big5', element[4] is t to denote
2811    BIG5-ETen or nil to denote BIG5-HKU.
2812
2813    If `coding->type' takes any other value, element[4] is ignored.
2814
2815    Emacs Lisp's coding system also carries information about format of
2816    end-of-line in a value of property `eol-type'.  If the value is
2817    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2818    means CODING_EOL_CR.  If it is not integer, it should be a vector
2819    of subsidiary coding systems of which property `eol-type' has one
2820    of above values.
2821
2822 */
2823
2824 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2825    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2826    is setup so that no conversion is necessary and return -1, else
2827    return 0.  */
2828
2829 int
2830 setup_coding_system (coding_system, coding)
2831      Lisp_Object coding_system;
2832      struct coding_system *coding;
2833 {
2834   Lisp_Object coding_spec, coding_type, eol_type, plist;
2835   Lisp_Object val;
2836   int i;
2837
2838   /* Initialize some fields required for all kinds of coding systems.  */
2839   coding->symbol = coding_system;
2840   coding->common_flags = 0;
2841   coding->mode = 0;
2842   coding->heading_ascii = -1;
2843   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2844   coding_spec = Fget (coding_system, Qcoding_system);
2845   if (!VECTORP (coding_spec)
2846       || XVECTOR (coding_spec)->size != 5
2847       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2848     goto label_invalid_coding_system;
2849
2850   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2851   if (VECTORP (eol_type))
2852     {
2853       coding->eol_type = CODING_EOL_UNDECIDED;
2854       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2855     }
2856   else if (XFASTINT (eol_type) == 1)
2857     {
2858       coding->eol_type = CODING_EOL_CRLF;
2859       coding->common_flags
2860         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2861     }
2862   else if (XFASTINT (eol_type) == 2)
2863     {
2864       coding->eol_type = CODING_EOL_CR;
2865       coding->common_flags
2866         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2867     }
2868   else
2869     coding->eol_type = CODING_EOL_LF;
2870
2871   coding_type = XVECTOR (coding_spec)->contents[0];
2872   /* Try short cut.  */
2873   if (SYMBOLP (coding_type))
2874     {
2875       if (EQ (coding_type, Qt))
2876         {
2877           coding->type = coding_type_undecided;
2878           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2879         }
2880       else
2881         coding->type = coding_type_no_conversion;
2882       return 0;
2883     }
2884
2885   /* Initialize remaining fields.  */
2886   coding->composing = 0;
2887   coding->composed_chars = 0;
2888
2889   /* Get values of coding system properties:
2890      `post-read-conversion', `pre-write-conversion',
2891      `translation-table-for-decode', `translation-table-for-encode'.  */
2892   plist = XVECTOR (coding_spec)->contents[3];
2893   coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2894   coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2895   val = Fplist_get (plist, Qtranslation_table_for_decode);
2896   if (SYMBOLP (val))
2897     val = Fget (val, Qtranslation_table_for_decode);
2898   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2899   val = Fplist_get (plist, Qtranslation_table_for_encode);
2900   if (SYMBOLP (val))
2901     val = Fget (val, Qtranslation_table_for_encode);
2902   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2903   val = Fplist_get (plist, Qcoding_category);
2904   if (!NILP (val))
2905     {
2906       val = Fget (val, Qcoding_category_index);
2907       if (INTEGERP (val))
2908         coding->category_idx = XINT (val);
2909       else
2910         goto label_invalid_coding_system;
2911     }
2912   else
2913     goto label_invalid_coding_system;
2914
2915   val = Fplist_get (plist, Qsafe_charsets);
2916   if (EQ (val, Qt))
2917     {
2918       for (i = 0; i <= MAX_CHARSET; i++)
2919         coding->safe_charsets[i] = 1;
2920     }
2921   else
2922     {
2923       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2924       while (CONSP (val))
2925         {
2926           if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2927             coding->safe_charsets[i] = 1;
2928           val = XCONS (val)->cdr;
2929         }
2930     }
2931
2932   switch (XFASTINT (coding_type))
2933     {
2934     case 0:
2935       coding->type = coding_type_emacs_mule;
2936       if (!NILP (coding->post_read_conversion))
2937         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2938       if (!NILP (coding->pre_write_conversion))
2939         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2940       break;
2941
2942     case 1:
2943       coding->type = coding_type_sjis;
2944       coding->common_flags
2945         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2946       break;
2947
2948     case 2:
2949       coding->type = coding_type_iso2022;
2950       coding->common_flags
2951         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2952       {
2953         Lisp_Object val, temp;
2954         Lisp_Object *flags;
2955         int i, charset, reg_bits = 0;
2956
2957         val = XVECTOR (coding_spec)->contents[4];
2958
2959         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2960           goto label_invalid_coding_system;
2961
2962         flags = XVECTOR (val)->contents;
2963         coding->flags
2964           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2965              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2966              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2967              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2968              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2969              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2970              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2971              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2972              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2973              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2974              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2975              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2976              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2977              );
2978
2979         /* Invoke graphic register 0 to plane 0.  */
2980         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2981         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2982         CODING_SPEC_ISO_INVOCATION (coding, 1)
2983           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2984         /* Not single shifting at first.  */
2985         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2986         /* Beginning of buffer should also be regarded as bol. */
2987         CODING_SPEC_ISO_BOL (coding) = 1;
2988
2989         for (charset = 0; charset <= MAX_CHARSET; charset++)
2990           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2991         val = Vcharset_revision_alist;
2992         while (CONSP (val))
2993           {
2994             charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2995             if (charset >= 0
2996                 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2997                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2998               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2999             val = XCONS (val)->cdr;
3000           }
3001
3002         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3003            FLAGS[REG] can be one of below:
3004                 integer CHARSET: CHARSET occupies register I,
3005                 t: designate nothing to REG initially, but can be used
3006                   by any charsets,
3007                 list of integer, nil, or t: designate the first
3008                   element (if integer) to REG initially, the remaining
3009                   elements (if integer) is designated to REG on request,
3010                   if an element is t, REG can be used by any charsets,
3011                 nil: REG is never used.  */
3012         for (charset = 0; charset <= MAX_CHARSET; charset++)
3013           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3014             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3015         for (i = 0; i < 4; i++)
3016           {
3017             if (INTEGERP (flags[i])
3018                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3019                 || (charset = get_charset_id (flags[i])) >= 0)
3020               {
3021                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3022                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3023               }
3024             else if (EQ (flags[i], Qt))
3025               {
3026                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3027                 reg_bits |= 1 << i;
3028                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3029               }
3030             else if (CONSP (flags[i]))
3031               {
3032                 Lisp_Object tail;
3033                 tail = flags[i];
3034
3035                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3036                 if (INTEGERP (XCONS (tail)->car)
3037                     && (charset = XINT (XCONS (tail)->car),
3038                         CHARSET_VALID_P (charset))
3039                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
3040                   {
3041                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3042                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3043                   }
3044                 else
3045                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3046                 tail = XCONS (tail)->cdr;
3047                 while (CONSP (tail))
3048                   {
3049                     if (INTEGERP (XCONS (tail)->car)
3050                         && (charset = XINT (XCONS (tail)->car),
3051                             CHARSET_VALID_P (charset))
3052                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
3053                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3054                         = i;
3055                     else if (EQ (XCONS (tail)->car, Qt))
3056                       reg_bits |= 1 << i;
3057                     tail = XCONS (tail)->cdr;
3058                   }
3059               }
3060             else
3061               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3062
3063             CODING_SPEC_ISO_DESIGNATION (coding, i)
3064               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3065           }
3066
3067         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3068           {
3069             /* REG 1 can be used only by locking shift in 7-bit env.  */
3070             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3071               reg_bits &= ~2;
3072             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3073               /* Without any shifting, only REG 0 and 1 can be used.  */
3074               reg_bits &= 3;
3075           }
3076
3077         if (reg_bits)
3078           for (charset = 0; charset <= MAX_CHARSET; charset++)
3079             {
3080               if (CHARSET_VALID_P (charset))
3081                 {
3082                   /* There exist some default graphic registers to be
3083                      used CHARSET.  */
3084
3085                   /* We had better avoid designating a charset of
3086                      CHARS96 to REG 0 as far as possible.  */
3087                   if (CHARSET_CHARS (charset) == 96)
3088                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3089                       = (reg_bits & 2
3090                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3091                   else
3092                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3093                       = (reg_bits & 1
3094                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3095                 }
3096             }
3097       }
3098       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3099       coding->spec.iso2022.last_invalid_designation_register = -1;
3100       break;
3101
3102     case 3:
3103       coding->type = coding_type_big5;
3104       coding->common_flags
3105         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3106       coding->flags
3107         = (NILP (XVECTOR (coding_spec)->contents[4])
3108            ? CODING_FLAG_BIG5_HKU
3109            : CODING_FLAG_BIG5_ETEN);
3110       break;
3111
3112     case 4:
3113       coding->type = coding_type_ccl;
3114       coding->common_flags
3115         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3116       {
3117         Lisp_Object val;
3118         Lisp_Object decoder, encoder;
3119
3120         val = XVECTOR (coding_spec)->contents[4];
3121         if (CONSP  (val)
3122             && SYMBOLP (XCONS (val)->car)
3123             && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
3124             && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
3125             && SYMBOLP (XCONS (val)->cdr)
3126             && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
3127             && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
3128           {
3129             setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3130             setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
3131           }
3132         else
3133           goto label_invalid_coding_system;
3134
3135         bzero (coding->spec.ccl.valid_codes, 256);
3136         val = Fplist_get (plist, Qvalid_codes);
3137         if (CONSP (val))
3138           {
3139             Lisp_Object this;
3140
3141             for (; CONSP (val); val = XCONS (val)->cdr)
3142               {
3143                 this = XCONS (val)->car;
3144                 if (INTEGERP (this)
3145                     && XINT (this) >= 0 && XINT (this) < 256)
3146                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3147                 else if (CONSP (this)
3148                          && INTEGERP (XCONS (this)->car)
3149                          && INTEGERP (XCONS (this)->cdr))
3150                   {
3151                     int start = XINT (XCONS (this)->car);
3152                     int end = XINT (XCONS (this)->cdr);
3153
3154                     if (start >= 0 && start <= end && end < 256)
3155                       while (start <= end)
3156                         coding->spec.ccl.valid_codes[start++] = 1;
3157                   }
3158               }
3159           }
3160       }
3161       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3162       break;
3163
3164     case 5:
3165       coding->type = coding_type_raw_text;
3166       break;
3167
3168     default:
3169       goto label_invalid_coding_system;
3170     }
3171   return 0;
3172
3173  label_invalid_coding_system:
3174   coding->type = coding_type_no_conversion;
3175   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3176   coding->common_flags = 0;
3177   coding->eol_type = CODING_EOL_LF;
3178   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3179   return -1;
3180 }
3181
3182 /* Setup raw-text or one of its subsidiaries in the structure
3183    coding_system CODING according to the already setup value eol_type
3184    in CODING.  CODING should be setup for some coding system in
3185    advance.  */
3186
3187 void
3188 setup_raw_text_coding_system (coding)
3189      struct coding_system *coding;
3190 {
3191   if (coding->type != coding_type_raw_text)
3192     {
3193       coding->symbol = Qraw_text;
3194       coding->type = coding_type_raw_text;
3195       if (coding->eol_type != CODING_EOL_UNDECIDED)
3196         {
3197           Lisp_Object subsidiaries;
3198           subsidiaries = Fget (Qraw_text, Qeol_type);
3199
3200           if (VECTORP (subsidiaries)
3201               && XVECTOR (subsidiaries)->size == 3)
3202             coding->symbol
3203               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3204         }
3205     }
3206   return;
3207 }
3208
3209 /* Emacs has a mechanism to automatically detect a coding system if it
3210    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3211    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
3214
3215    o coding-category-emacs-mule
3216
3217         The category for a coding system which has the same code range
3218         as Emacs' internal format.  Assigned the coding-system (Lisp
3219         symbol) `emacs-mule' by default.
3220
3221    o coding-category-sjis
3222
3223         The category for a coding system which has the same code range
3224         as SJIS.  Assigned the coding-system (Lisp
3225         symbol) `japanese-shift-jis' by default.
3226
3227    o coding-category-iso-7
3228
3229         The category for a coding system which has the same code range
3230         as ISO2022 of 7-bit environment.  This doesn't use any locking
3231         shift and single shift functions.  This can encode/decode all
3232         charsets.  Assigned the coding-system (Lisp symbol)
3233         `iso-2022-7bit' by default.
3234
3235    o coding-category-iso-7-tight
3236
3237         Same as coding-category-iso-7 except that this can
3238         encode/decode only the specified charsets.
3239
3240    o coding-category-iso-8-1
3241
3242         The category for a coding system which has the same code range
3243         as ISO2022 of 8-bit environment and graphic plane 1 used only
3244         for DIMENSION1 charset.  This doesn't use any locking shift
3245         and single shift functions.  Assigned the coding-system (Lisp
3246         symbol) `iso-latin-1' by default.
3247
3248    o coding-category-iso-8-2
3249
3250         The category for a coding system which has the same code range
3251         as ISO2022 of 8-bit environment and graphic plane 1 used only
3252         for DIMENSION2 charset.  This doesn't use any locking shift
3253         and single shift functions.  Assigned the coding-system (Lisp
3254         symbol) `japanese-iso-8bit' by default.
3255
3256    o coding-category-iso-7-else
3257
3258         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
3260         single shift functions.  Assigned the coding-system (Lisp
3261         symbol) `iso-2022-7bit-lock' by default.
3262
3263    o coding-category-iso-8-else
3264
3265         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
3267         single shift functions.  Assigned the coding-system (Lisp
3268         symbol) `iso-2022-8bit-ss2' by default.
3269
3270    o coding-category-big5
3271
3272         The category for a coding system which has the same code range
3273         as BIG5.  Assigned the coding-system (Lisp symbol)
3274         `cn-big5' by default.
3275
3276    o coding-category-ccl
3277
3278         The category for a coding system of which encoder/decoder is
3279         written in CCL programs.  The default value is nil, i.e., no
3280         coding system is assigned.
3281
3282    o coding-category-binary
3283
3284         The category for a coding system not categorized in any of the
3285         above.  Assigned the coding-system (Lisp symbol)
3286         `no-conversion' by default.
3287
3288    Each of them is a Lisp symbol and the value is an actual
3289    `coding-system's (this is also a Lisp symbol) assigned by a user.
3290    What Emacs does actually is to detect a category of coding system.
3291    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3292    decide only one possible category, it selects a category of the
3293    highest priority.  Priorities of categories are also specified by a
3294    user in a Lisp variable `coding-category-list'.
3295
3296 */
3297
3298 static
3299 int ascii_skip_code[256];
3300
3301 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3302    If it detects possible coding systems, return an integer in which
3303    appropriate flag bits are set.  Flag bits are defined by macros
3304    CODING_CATEGORY_MASK_XXX in `coding.h'.
3305
3306    How many ASCII characters are at the head is returned as *SKIP.  */
3307
3308 static int
3309 detect_coding_mask (source, src_bytes, priorities, skip)
3310      unsigned char *source;
3311      int src_bytes, *priorities, *skip;
3312 {
3313   register unsigned char c;
3314   unsigned char *src = source, *src_end = source + src_bytes;
3315   unsigned int mask;
3316   int i;
3317
3318   /* At first, skip all ASCII characters and control characters except
3319      for three ISO2022 specific control characters.  */
3320   ascii_skip_code[ISO_CODE_SO] = 0;
3321   ascii_skip_code[ISO_CODE_SI] = 0;
3322   ascii_skip_code[ISO_CODE_ESC] = 0;
3323
3324  label_loop_detect_coding:
3325   while (src < src_end && ascii_skip_code[*src]) src++;
3326   *skip = src - source;
3327
3328   if (src >= src_end)
3329     /* We found nothing other than ASCII.  There's nothing to do.  */
3330     return 0;
3331
3332   c = *src;
3333   /* The text seems to be encoded in some multilingual coding system.
3334      Now, try to find in which coding system the text is encoded.  */
3335   if (c < 0x80)
3336     {
3337       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3338       /* C is an ISO2022 specific control code of C0.  */
3339       mask = detect_coding_iso2022 (src, src_end);
3340       if (mask == 0)
3341         {
3342           /* No valid ISO2022 code follows C.  Try again.  */
3343           src++;
3344           if (c == ISO_CODE_ESC)
3345             ascii_skip_code[ISO_CODE_ESC] = 1;
3346           else
3347             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3348           goto label_loop_detect_coding;
3349         }
3350       if (priorities)
3351         goto label_return_highest_only;
3352     }
3353   else
3354     {
3355       int try;
3356
3357       if (c < 0xA0)
3358         {
3359           /* C is the first byte of SJIS character code,
3360              or a leading-code of Emacs' internal format (emacs-mule).  */
3361           try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3362
3363           /* Or, if C is a special latin extra code,
3364              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3365              or is an ISO2022 control-sequence-introducer (CSI),
3366              we should also consider the possibility of ISO2022 codings.  */
3367           if ((VECTORP (Vlatin_extra_code_table)
3368                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3369               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3370               || (c == ISO_CODE_CSI
3371                   && (src < src_end
3372                       && (*src == ']'
3373                           || ((*src == '0' || *src == '1' || *src == '2')
3374                               && src + 1 < src_end
3375                               && src[1] == ']')))))
3376             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3377                      | CODING_CATEGORY_MASK_ISO_8BIT);
3378         }
3379       else
3380         /* C is a character of ISO2022 in graphic plane right,
3381            or a SJIS's 1-byte character code (i.e. JISX0201),
3382            or the first byte of BIG5's 2-byte code.  */
3383         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3384                 | CODING_CATEGORY_MASK_ISO_8BIT
3385                 | CODING_CATEGORY_MASK_SJIS
3386                 | CODING_CATEGORY_MASK_BIG5);
3387
3388       /* Or, we may have to consider the possibility of CCL.  */
3389       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3390           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3391               ->spec.ccl.valid_codes)[c])
3392         try |= CODING_CATEGORY_MASK_CCL;
3393
3394       mask = 0;
3395       if (priorities)
3396         {
3397           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3398             {
3399               if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
3400                 mask = detect_coding_iso2022 (src, src_end);
3401               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3402                 mask = detect_coding_sjis (src, src_end);
3403               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3404                 mask = detect_coding_big5 (src, src_end);
3405               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3406                 mask = detect_coding_emacs_mule (src, src_end);
3407               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3408                 mask = detect_coding_ccl (src, src_end);
3409               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3410                 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3411               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3412                 mask = CODING_CATEGORY_MASK_BINARY;
3413               if (mask)
3414                 goto label_return_highest_only;
3415             }
3416           return CODING_CATEGORY_MASK_RAW_TEXT;
3417         }
3418       if (try & CODING_CATEGORY_MASK_ISO)
3419         mask |= detect_coding_iso2022 (src, src_end);
3420       if (try & CODING_CATEGORY_MASK_SJIS)
3421         mask |= detect_coding_sjis (src, src_end);
3422       if (try & CODING_CATEGORY_MASK_BIG5)
3423         mask |= detect_coding_big5 (src, src_end);
3424       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3425         mask |= detect_coding_emacs_mule (src, src_end);
3426       if (try & CODING_CATEGORY_MASK_CCL)
3427         mask |= detect_coding_ccl (src, src_end);
3428     }
3429   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3430
3431  label_return_highest_only:
3432   for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3433     {
3434       if (mask & priorities[i])
3435         return priorities[i];
3436     }
3437   return CODING_CATEGORY_MASK_RAW_TEXT;
3438 }
3439
3440 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3441    The information of the detected coding system is set in CODING.  */
3442
3443 void
3444 detect_coding (coding, src, src_bytes)
3445      struct coding_system *coding;
3446      unsigned char *src;
3447      int src_bytes;
3448 {
3449   unsigned int idx;
3450   int skip, mask, i;
3451   Lisp_Object val;
3452
3453   val = Vcoding_category_list;
3454   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3455   coding->heading_ascii = skip;
3456
3457   if (!mask) return;
3458
3459   /* We found a single coding system of the highest priority in MASK.  */
3460   idx = 0;
3461   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3462   if (! mask)
3463     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3464
3465   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3466
3467   if (coding->eol_type != CODING_EOL_UNDECIDED)
3468     {
3469       Lisp_Object tmp;
3470
3471       tmp = Fget (val, Qeol_type);
3472       if (VECTORP (tmp))
3473         val = XVECTOR (tmp)->contents[coding->eol_type];
3474     }
3475   setup_coding_system (val, coding);
3476   /* Set this again because setup_coding_system reset this member.  */
3477   coding->heading_ascii = skip;
3478 }
3479
3480 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3481    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3482    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3483
3484    How many non-eol characters are at the head is returned as *SKIP.  */
3485
3486 #define MAX_EOL_CHECK_COUNT 3
3487
3488 static int
3489 detect_eol_type (source, src_bytes, skip)
3490      unsigned char *source;
3491      int src_bytes, *skip;
3492 {
3493   unsigned char *src = source, *src_end = src + src_bytes;
3494   unsigned char c;
3495   int total = 0;                /* How many end-of-lines are found so far.  */
3496   int eol_type = CODING_EOL_UNDECIDED;
3497   int this_eol_type;
3498
3499   *skip = 0;
3500
3501   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3502     {
3503       c = *src++;
3504       if (c == '\n' || c == '\r')
3505         {
3506           if (*skip == 0)
3507             *skip = src - 1 - source;
3508           total++;
3509           if (c == '\n')
3510             this_eol_type = CODING_EOL_LF;
3511           else if (src >= src_end || *src != '\n')
3512             this_eol_type = CODING_EOL_CR;
3513           else
3514             this_eol_type = CODING_EOL_CRLF, src++;
3515
3516           if (eol_type == CODING_EOL_UNDECIDED)
3517             /* This is the first end-of-line.  */
3518             eol_type = this_eol_type;
3519           else if (eol_type != this_eol_type)
3520             {
3521               /* The found type is different from what found before.  */
3522               eol_type = CODING_EOL_INCONSISTENT;
3523               break;
3524             }
3525         }
3526     }
3527
3528   if (*skip == 0)
3529     *skip = src_end - source;
3530   return eol_type;
3531 }
3532
3533 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3534    is encoded.  If it detects an appropriate format of end-of-line, it
3535    sets the information in *CODING.  */
3536
3537 void
3538 detect_eol (coding, src, src_bytes)
3539      struct coding_system *coding;
3540      unsigned char *src;
3541      int src_bytes;
3542 {
3543   Lisp_Object val;
3544   int skip;
3545   int eol_type = detect_eol_type (src, src_bytes, &skip);
3546
3547   if (coding->heading_ascii > skip)
3548     coding->heading_ascii = skip;
3549   else
3550     skip = coding->heading_ascii;
3551
3552   if (eol_type == CODING_EOL_UNDECIDED)
3553     return;
3554   if (eol_type == CODING_EOL_INCONSISTENT)
3555     {
3556 #if 0
3557       /* This code is suppressed until we find a better way to
3558          distinguish raw text file and binary file.  */
3559
3560       /* If we have already detected that the coding is raw-text, the
3561          coding should actually be no-conversion.  */
3562       if (coding->type == coding_type_raw_text)
3563         {
3564           setup_coding_system (Qno_conversion, coding);
3565           return;
3566         }
3567       /* Else, let's decode only text code anyway.  */
3568 #endif /* 0 */
3569       eol_type = CODING_EOL_LF;
3570     }
3571
3572   val = Fget (coding->symbol, Qeol_type);
3573   if (VECTORP (val) && XVECTOR (val)->size == 3)
3574     {
3575       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3576       coding->heading_ascii = skip;
3577     }
3578 }
3579
3580 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3581
3582 #define DECODING_BUFFER_MAG(coding)                                          \
3583   (coding->type == coding_type_iso2022                                       \
3584    ? 3                                                                       \
3585    : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3586       ? 2                                                                    \
3587       : (coding->type == coding_type_raw_text                                \
3588          ? 1                                                                 \
3589          : (coding->type == coding_type_ccl                                  \
3590             ? coding->spec.ccl.decoder.buf_magnification                     \
3591             : 2))))
3592
3593 /* Return maximum size (bytes) of a buffer enough for decoding
3594    SRC_BYTES of text encoded in CODING.  */
3595
3596 int
3597 decoding_buffer_size (coding, src_bytes)
3598      struct coding_system *coding;
3599      int src_bytes;
3600 {
3601   return (src_bytes * DECODING_BUFFER_MAG (coding)
3602           + CONVERSION_BUFFER_EXTRA_ROOM);
3603 }
3604
3605 /* Return maximum size (bytes) of a buffer enough for encoding
3606    SRC_BYTES of text to CODING.  */
3607
3608 int
3609 encoding_buffer_size (coding, src_bytes)
3610      struct coding_system *coding;
3611      int src_bytes;
3612 {
3613   int magnification;
3614
3615   if (coding->type == coding_type_ccl)
3616     magnification = coding->spec.ccl.encoder.buf_magnification;
3617   else
3618     magnification = 3;
3619
3620   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3621 }
3622
/* Smallest size the conversion buffer is grown to.  */
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  Sufficient memory is allocated automatically: when
   the current buffer is too small it is replaced by a new one whose
   size is obtained by repeated doubling.  The contents of the old
   buffer are not preserved.

   NOTE(review): this function never returns NULL by itself; what
   happens when memory runs out depends on xmalloc -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never grows, so start from a sane minimum in
         case the buffer has not been allocated yet.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      /* Allocate the new buffer before releasing the old one.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3651
3652 int
3653 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3654      struct coding_system *coding;
3655      unsigned char *source, *destination;
3656      int src_bytes, dst_bytes, encodep;
3657 {
3658   struct ccl_program *ccl
3659     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3660   int result;
3661
3662   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3663
3664   coding->produced = ccl_driver (ccl, source, destination,
3665                                  src_bytes, dst_bytes, &(coding->consumed));
3666   coding->produced_char
3667     = (encodep
3668        ? coding->produced
3669        : multibyte_chars_in_text (destination, coding->produced));
3670   coding->consumed_char
3671     = multibyte_chars_in_text (source, coding->consumed);
3672
3673   switch (ccl->status)
3674     {
3675     case CCL_STAT_SUSPEND_BY_SRC:
3676       result = CODING_FINISH_INSUFFICIENT_SRC;
3677       break;
3678     case CCL_STAT_SUSPEND_BY_DST:
3679       result = CODING_FINISH_INSUFFICIENT_DST;
3680       break;
3681     case CCL_STAT_QUIT:
3682     case CCL_STAT_INVALID_CMD:
3683       result = CODING_FINISH_INTERRUPT;
3684       break;
3685     default:
3686       result = CODING_FINISH_NORMAL;
3687       break;
3688     }
3689   return result;
3690 }
3691
3692 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
3693    decoding, it may detect coding system and format of end-of-line if
3694    those are not yet decided.  */
3695
3696 int
3697 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3698      struct coding_system *coding;
3699      unsigned char *source, *destination;
3700      int src_bytes, dst_bytes;
3701 {
3702   int result;
3703
3704   if (src_bytes <= 0
3705       && coding->type != coding_type_ccl
3706       && ! (coding->mode & CODING_MODE_LAST_BLOCK
3707             && CODING_REQUIRE_FLUSHING (coding)))
3708     {
3709       coding->produced = coding->produced_char = 0;
3710       coding->consumed = coding->consumed_char = 0;
3711       coding->fake_multibyte = 0;
3712       return CODING_FINISH_NORMAL;
3713     }
3714
3715   if (coding->type == coding_type_undecided)
3716     detect_coding (coding, source, src_bytes);
3717
3718   if (coding->eol_type == CODING_EOL_UNDECIDED)
3719     detect_eol (coding, source, src_bytes);
3720
3721   switch (coding->type)
3722     {
3723     case coding_type_emacs_mule:
3724     case coding_type_undecided:
3725     case coding_type_raw_text:
3726       if (coding->eol_type == CODING_EOL_LF
3727           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3728         goto label_no_conversion;
3729       result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3730       break;
3731
3732     case coding_type_sjis:
3733       result = decode_coding_sjis_big5 (coding, source, destination,
3734                                         src_bytes, dst_bytes, 1);
3735       break;
3736
3737     case coding_type_iso2022:
3738       result = decode_coding_iso2022 (coding, source, destination,
3739                                       src_bytes, dst_bytes);
3740       break;
3741
3742     case coding_type_big5:
3743       result = decode_coding_sjis_big5 (coding, source, destination,
3744                                         src_bytes, dst_bytes, 0);
3745       break;
3746
3747     case coding_type_ccl:
3748       result = ccl_coding_driver (coding, source, destination,
3749                                   src_bytes, dst_bytes, 0);
3750       break;
3751
3752     default:                    /* i.e. case coding_type_no_conversion: */
3753     label_no_conversion:
3754       if (dst_bytes && src_bytes > dst_bytes)
3755         {
3756           coding->produced = dst_bytes;
3757           result = CODING_FINISH_INSUFFICIENT_DST;
3758         }
3759       else
3760         {
3761           coding->produced = src_bytes;
3762           result = CODING_FINISH_NORMAL;
3763         }
3764       if (dst_bytes)
3765         bcopy (source, destination, coding->produced);
3766       else
3767         safe_bcopy (source, destination, coding->produced);
3768       coding->fake_multibyte = 1;
3769       coding->consumed
3770         = coding->consumed_char = coding->produced_char = coding->produced;
3771       break;
3772     }
3773
3774   return result;
3775 }
3776
3777 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
3778
3779 int
3780 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3781      struct coding_system *coding;
3782      unsigned char *source, *destination;
3783      int src_bytes, dst_bytes;
3784 {
3785   int result;
3786
3787   if (src_bytes <= 0
3788       && ! (coding->mode & CODING_MODE_LAST_BLOCK
3789             && CODING_REQUIRE_FLUSHING (coding)))
3790     {
3791       coding->produced = coding->produced_char = 0;
3792       coding->consumed = coding->consumed_char = 0;
3793       coding->fake_multibyte = 0;
3794       return CODING_FINISH_NORMAL;
3795     }
3796
3797   switch (coding->type)
3798     {
3799     case coding_type_emacs_mule:
3800     case coding_type_undecided:
3801     case coding_type_raw_text:
3802       if (coding->eol_type == CODING_EOL_LF
3803           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3804         goto label_no_conversion;
3805       result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3806       break;
3807
3808     case coding_type_sjis:
3809       result = encode_coding_sjis_big5 (coding, source, destination,
3810                                         src_bytes, dst_bytes, 1);
3811       break;
3812
3813     case coding_type_iso2022:
3814       result = encode_coding_iso2022 (coding, source, destination,
3815                                       src_bytes, dst_bytes);
3816       break;
3817
3818     case coding_type_big5:
3819       result = encode_coding_sjis_big5 (coding, source, destination,
3820                                         src_bytes, dst_bytes, 0);
3821       break;
3822
3823     case coding_type_ccl:
3824       result = ccl_coding_driver (coding, source, destination,
3825                                   src_bytes, dst_bytes, 1);
3826       break;
3827
3828     default:                    /* i.e. case coding_type_no_conversion: */
3829     label_no_conversion:
3830       if (dst_bytes && src_bytes > dst_bytes)
3831         {
3832           coding->produced = dst_bytes;
3833           result = CODING_FINISH_INSUFFICIENT_DST;
3834         }
3835       else
3836         {
3837           coding->produced = src_bytes;
3838           result = CODING_FINISH_NORMAL;
3839         }
3840       if (dst_bytes)
3841         bcopy (source, destination, coding->produced);
3842       else
3843         safe_bcopy (source, destination, coding->produced);
3844       if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3845         {
3846           unsigned char *p = destination, *pend = p + coding->produced;
3847           while (p < pend)
3848             if (*p++ == '\015') p[-1] = '\n';
3849         }
3850       coding->fake_multibyte = 1;
3851       coding->consumed
3852         = coding->consumed_char = coding->produced_char = coding->produced;
3853       break;
3854     }
3855
3856   return result;
3857 }
3858
3859 /* Scan text in the region between *BEG and *END (byte positions),
3860    skip characters which we don't have to decode by coding system
3861    CODING at the head and tail, then set *BEG and *END to the region
3862    of the text we actually have to convert.  The caller should move
3863    the gap out of the region in advance.
3864
3865    If STR is not NULL, *BEG and *END are indices into STR.  */
3866
3867 static void
3868 shrink_decoding_region (beg, end, coding, str)
3869      int *beg, *end;
3870      struct coding_system *coding;
3871      unsigned char *str;
3872 {
3873   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3874   int eol_conversion;
3875   Lisp_Object translation_table;
3876
3877   if (coding->type == coding_type_ccl
3878       || coding->type == coding_type_undecided
3879       || !NILP (coding->post_read_conversion))
3880     {
3881       /* We can't skip any data.  */
3882       return;
3883     }
3884   else if (coding->type == coding_type_no_conversion)
3885     {
3886       /* We need no conversion, but don't have to skip any data here.
3887          Decoding routine handles them effectively anyway.  */
3888       return;
3889     }
3890
3891   translation_table = coding->translation_table_for_decode;
3892   if (NILP (translation_table) && !NILP (Venable_character_translation))
3893     translation_table = Vstandard_translation_table_for_decode;
3894   if (CHAR_TABLE_P (translation_table))
3895     {
3896       int i;
3897       for (i = 0; i < 128; i++)
3898         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3899           break;
3900       if (i < 128)
3901         /* Some ASCII character should be tranlsated.  We give up
3902            shrinking.  */
3903         return;
3904     }
3905
3906   eol_conversion = (coding->eol_type != CODING_EOL_LF);
3907
3908   if ((! eol_conversion) && (coding->heading_ascii >= 0))
3909     /* Detection routine has already found how much we can skip at the
3910        head.  */
3911     *beg += coding->heading_ascii;
3912
3913   if (str)
3914     {
3915       begp_orig = begp = str + *beg;
3916       endp_orig = endp = str + *end;
3917     }
3918   else
3919     {
3920       begp_orig = begp = BYTE_POS_ADDR (*beg);
3921       endp_orig = endp = begp + *end - *beg;
3922     }
3923
3924   switch (coding->type)
3925     {
3926     case coding_type_emacs_mule:
3927     case coding_type_raw_text:
3928       if (eol_conversion)
3929         {
3930           if (coding->heading_ascii < 0)
3931             while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3932           while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
3933             endp--;
3934           /* Do not consider LF as ascii if preceded by CR, since that
3935              confuses eol decoding. */
3936           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3937             endp++;
3938         }
3939       else
3940         begp = endp;
3941       break;
3942
3943     case coding_type_sjis:
3944     case coding_type_big5:
3945       /* We can skip all ASCII characters at the head.  */
3946       if (coding->heading_ascii < 0)
3947         {
3948           if (eol_conversion)
3949             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
3950           else
3951             while (begp < endp && *begp < 0x80) begp++;
3952         }
3953       /* We can skip all ASCII characters at the tail except for the
3954          second byte of SJIS or BIG5 code.  */
3955       if (eol_conversion)
3956         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
3957       else
3958         while (begp < endp && endp[-1] < 0x80) endp--;
3959       /* Do not consider LF as ascii if preceded by CR, since that
3960          confuses eol decoding. */
3961       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3962         endp++;
3963       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3964         endp++;
3965       break;
3966
3967     default:            /* i.e. case coding_type_iso2022: */
3968       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
3969         /* We can't skip any data.  */
3970         break;
3971       if (coding->heading_ascii < 0)
3972         {
3973           /* We can skip all ASCII characters at the head except for a
3974              few control codes.  */
3975           while (begp < endp && (c = *begp) < 0x80
3976                  && c != ISO_CODE_CR && c != ISO_CODE_SO
3977                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
3978                  && (!eol_conversion || c != ISO_CODE_LF))
3979             begp++;
3980         }
3981       switch (coding->category_idx)
3982         {
3983         case CODING_CATEGORY_IDX_ISO_8_1:
3984         case CODING_CATEGORY_IDX_ISO_8_2:
3985           /* We can skip all ASCII characters at the tail.  */
3986           if (eol_conversion)
3987             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
3988           else
3989             while (begp < endp && endp[-1] < 0x80) endp--;
3990           /* Do not consider LF as ascii if preceded by CR, since that
3991              confuses eol decoding. */
3992           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3993             endp++;
3994           break;
3995
3996         case CODING_CATEGORY_IDX_ISO_7:
3997         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
3998           {
3999             /* We can skip all charactes at the tail except for 8-bit
4000                codes and ESC and the following 2-byte at the tail.  */
4001             unsigned char *eight_bit = NULL;
4002
4003             if (eol_conversion)
4004               while (begp < endp
4005                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4006                 {
4007                   if (!eight_bit && c & 0x80) eight_bit = endp;
4008                   endp--;
4009                 }
4010             else
4011               while (begp < endp
4012                      && (c = endp[-1]) != ISO_CODE_ESC)
4013                 {
4014                   if (!eight_bit && c & 0x80) eight_bit = endp;
4015                   endp--;
4016                 }
4017             /* Do not consider LF as ascii if preceded by CR, since that
4018                confuses eol decoding. */
4019             if (begp < endp && endp < endp_orig
4020                 && endp[-1] == '\r' && endp[0] == '\n')
4021               endp++;
4022             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4023               {
4024                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4025                   /* This is an ASCII designation sequence.  We can
4026                      surely skip the tail.  But, if we have
4027                      encountered an 8-bit code, skip only the codes
4028                      after that.  */
4029                   endp = eight_bit ? eight_bit : endp + 2;
4030                 else
4031                   /* Hmmm, we can't skip the tail.  */
4032                   endp = endp_orig;
4033               }
4034             else if (eight_bit)
4035               endp = eight_bit;
4036           }
4037         }
4038     }
4039   *beg += begp - begp_orig;
4040   *end += endp - endp_orig;
4041   return;
4042 }
4043
4044 /* Like shrink_decoding_region but for encoding.  */
4045
4046 static void
4047 shrink_encoding_region (beg, end, coding, str)
4048      int *beg, *end;
4049      struct coding_system *coding;
4050      unsigned char *str;
4051 {
4052   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4053   int eol_conversion;
4054   Lisp_Object translation_table;
4055
4056   if (coding->type == coding_type_ccl)
4057     /* We can't skip any data.  */
4058     return;
4059   else if (coding->type == coding_type_no_conversion)
4060     {
4061       /* We need no conversion.  */
4062       *beg = *end;
4063       return;
4064     }
4065
4066   translation_table = coding->translation_table_for_encode;
4067   if (NILP (translation_table) && !NILP (Venable_character_translation))
4068     translation_table = Vstandard_translation_table_for_encode;
4069   if (CHAR_TABLE_P (translation_table))
4070     {
4071       int i;
4072       for (i = 0; i < 128; i++)
4073         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4074           break;
4075       if (i < 128)
4076         /* Some ASCII character should be tranlsated.  We give up
4077            shrinking.  */
4078         return;
4079     }
4080
4081   if (str)
4082     {
4083       begp_orig = begp = str + *beg;
4084       endp_orig = endp = str + *end;
4085     }
4086   else
4087     {
4088       begp_orig = begp = BYTE_POS_ADDR (*beg);
4089       endp_orig = endp = begp + *end - *beg;
4090     }
4091
4092   eol_conversion = (coding->eol_type == CODING_EOL_CR
4093                     || coding->eol_type == CODING_EOL_CRLF);
4094
4095   /* Here, we don't have to check coding->pre_write_conversion because
4096      the caller is expected to have handled it already.  */
4097   switch (coding->type)
4098     {
4099     case coding_type_undecided:
4100     case coding_type_emacs_mule:
4101     case coding_type_raw_text:
4102       if (eol_conversion)
4103         {
4104           while (begp < endp && *begp != '\n') begp++;
4105           while (begp < endp && endp[-1] != '\n') endp--;
4106         }
4107       else
4108         begp = endp;
4109       break;
4110
4111     case coding_type_iso2022:
4112       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4113         /* We can't skip any data.  */
4114         break;
4115       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4116         {
4117           unsigned char *bol = begp;
4118           while (begp < endp && *begp < 0x80)
4119             {
4120               begp++;
4121               if (begp[-1] == '\n')
4122                 bol = begp;
4123             }
4124           begp = bol;
4125           goto label_skip_tail;
4126         }
4127       /* fall down ... */
4128
4129     default:
4130       /* We can skip all ASCII characters at the head and tail.  */
4131       if (eol_conversion)
4132         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4133       else
4134         while (begp < endp && *begp < 0x80) begp++;
4135     label_skip_tail:
4136       if (eol_conversion)
4137         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4138       else
4139         while (begp < endp && *(endp - 1) < 0x80) endp--;
4140       break;
4141     }
4142
4143   *beg += begp - begp_orig;
4144   *end += endp - endp_orig;
4145   return;
4146 }
4147
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only when the region is longer than this many
   bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region delimited by *BEG and *END with
   shrink_encoding_region (if ENCODEP is nonzero) or
   shrink_decoding_region (otherwise), provided the region is longer
   than shrink_conversion_region_threshhold.  BEG and END are pointers
   to int; CODING and STR are passed through unchanged.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region ((beg), (end), (coding), (str));       \
        else                                                            \
          shrink_decoding_region ((beg), (end), (coding), (str));       \
      }                                                                 \
  } while (0)
4161
4162 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4163    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4164    coding system CODING, and return the status code of code conversion
4165    (currently, this value has no meaning).
4166
4167    How many characters (and bytes) are converted to how many
4168    characters (and bytes) are recorded in members of the structure
4169    CODING.
4170
4171    If REPLACE is nonzero, we do various things as if the original text
4172    is deleted and a new text is inserted.  See the comments in
4173    replace_range (insdel.c) to know what we are doing.  */
4174
4175 int
4176 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4177      int from, from_byte, to, to_byte, encodep, replace;
4178      struct coding_system *coding;
4179 {
4180   int len = to - from, len_byte = to_byte - from_byte;
4181   int require, inserted, inserted_byte;
4182   int head_skip, tail_skip, total_skip;
4183   Lisp_Object saved_coding_symbol;
4184   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4185   int first = 1;
4186   int fake_multibyte = 0;
4187   unsigned char *src, *dst;
4188   Lisp_Object deletion;
4189   int orig_point = PT, orig_len = len;
4190   int prev_Z;
4191
4192   deletion = Qnil;
4193   saved_coding_symbol = Qnil;
4194
4195   if (from < PT && PT < to)
4196     {
4197       TEMP_SET_PT_BOTH (from, from_byte);
4198       orig_point = from;
4199     }
4200
4201   if (replace)
4202     {
4203       int saved_from = from;
4204
4205       prepare_to_modify_buffer (from, to, &from);
4206       if (saved_from != from)
4207         {
4208           to = from + len;
4209           if (multibyte)
4210             from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4211           else
4212             from_byte = from, to_byte = to;
4213           len_byte = to_byte - from_byte;
4214         }
4215     }
4216
4217   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4218     {
4219       /* We must detect encoding of text and eol format.  */
4220
4221       if (from < GPT && to > GPT)
4222         move_gap_both (from, from_byte);
4223       if (coding->type == coding_type_undecided)
4224         {
4225           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4226           if (coding->type == coding_type_undecided)
4227             /* It seems that the text contains only ASCII, but we
4228                should not left it undecided because the deeper
4229                decoding routine (decode_coding) tries to detect the
4230                encodings again in vain.  */
4231             coding->type = coding_type_emacs_mule;
4232         }
4233       if (coding->eol_type == CODING_EOL_UNDECIDED)
4234         {
4235           saved_coding_symbol = coding->symbol;
4236           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4237           if (coding->eol_type == CODING_EOL_UNDECIDED)
4238             coding->eol_type = CODING_EOL_LF;
4239           /* We had better recover the original eol format if we
4240              encounter an inconsitent eol format while decoding.  */
4241           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4242         }
4243     }
4244
4245   coding->consumed_char = len, coding->consumed = len_byte;
4246
4247   if (encodep
4248       ? ! CODING_REQUIRE_ENCODING (coding)
4249       : ! CODING_REQUIRE_DECODING (coding))
4250     {
4251       coding->produced = len_byte;
4252       if (multibyte
4253           && ! replace
4254           /* See the comment of the member heading_ascii in coding.h.  */
4255           && coding->heading_ascii < len_byte)
4256         {
4257           /* We still may have to combine byte at the head and the
4258              tail of the text in the region.  */
4259           if (from < GPT && GPT < to)
4260             move_gap_both (to, to_byte);
4261           len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4262           adjust_after_insert (from, from_byte, to, to_byte, len);
4263           coding->produced_char = len;
4264         }
4265       else
4266         {
4267           if (!replace)
4268             adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4269           coding->produced_char = len_byte;
4270         }
4271       return 0;
4272     }
4273
4274   /* Now we convert the text.  */
4275
4276   /* For encoding, we must process pre-write-conversion in advance.  */
4277   if (encodep
4278       && ! NILP (coding->pre_write_conversion)
4279       && SYMBOLP (coding->pre_write_conversion)
4280       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4281     {
4282       /* The function in pre-write-conversion may put a new text in a
4283          new buffer.  */
4284       struct buffer *prev = current_buffer;
4285       Lisp_Object new;
4286
4287       call2 (coding->pre_write_conversion,
4288              make_number (from), make_number (to));
4289       if (current_buffer != prev)
4290         {
4291           len = ZV - BEGV;
4292           new = Fcurrent_buffer ();
4293           set_buffer_internal_1 (prev);
4294           del_range_2 (from, from_byte, to, to_byte);
4295           TEMP_SET_PT_BOTH (from, from_byte);
4296           insert_from_buffer (XBUFFER (new), 1, len, 0);
4297           Fkill_buffer (new);
4298           if (orig_point >= to)
4299             orig_point += len - orig_len;
4300           else if (orig_point > from)
4301             orig_point = from;
4302           orig_len = len;
4303           to = from + len;
4304           from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4305           to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4306           len_byte = to_byte - from_byte;
4307           TEMP_SET_PT_BOTH (from, from_byte);
4308         }
4309     }
4310
4311   if (replace)
4312     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4313
4314   /* Try to skip the heading and tailing ASCIIs.  */
4315   {
4316     int from_byte_orig = from_byte, to_byte_orig = to_byte;
4317
4318     if (from < GPT && GPT < to)
4319       move_gap_both (from, from_byte);
4320     SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4321     if (from_byte == to_byte
4322         && coding->type != coding_type_ccl
4323         && ! (coding->mode & CODING_MODE_LAST_BLOCK
4324               && CODING_REQUIRE_FLUSHING (coding)))
4325       {
4326         coding->produced = len_byte;
4327         coding->produced_char = multibyte ? len : len_byte;
4328         if (!replace)
4329           /* We must record and adjust for this new text now.  */
4330           adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4331         return 0;
4332       }
4333
4334     head_skip = from_byte - from_byte_orig;
4335     tail_skip = to_byte_orig - to_byte;
4336     total_skip = head_skip + tail_skip;
4337     from += head_skip;
4338     to -= tail_skip;
4339     len -= total_skip; len_byte -= total_skip;
4340   }
4341
4342   /* The code conversion routine can not preserve text properties for
4343      now.  So, we must remove all text properties in the region.
4344      Here, we must suppress all modification hooks.  */
4345   if (replace)
4346     {
4347       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4348       inhibit_modification_hooks = 1;
4349       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4350       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4351     }
4352
4353   /* For converion, we must put the gap before the text in addition to
4354      making the gap larger for efficient decoding.  The required gap
4355      size starts from 2000 which is the magic number used in make_gap.
4356      But, after one batch of conversion, it will be incremented if we
4357      find that it is not enough .  */
4358   require = 2000;
4359
4360   if (GAP_SIZE  < require)
4361     make_gap (require - GAP_SIZE);
4362   move_gap_both (from, from_byte);
4363
4364   inserted = inserted_byte = 0;
4365   src = GAP_END_ADDR, dst = GPT_ADDR;
4366
4367   GAP_SIZE += len_byte;
4368   ZV -= len;
4369   Z -= len;
4370   ZV_BYTE -= len_byte;
4371   Z_BYTE -= len_byte;
4372
4373   if (GPT - BEG < beg_unchanged)
4374     beg_unchanged = GPT - BEG;
4375   if (Z - GPT < end_unchanged)
4376     end_unchanged = Z - GPT;
4377
4378   for (;;)
4379     {
4380       int result;
4381
4382       /* The buffer memory is changed from:
4383          +--------+converted-text+---------+-------original-text------+---+
4384          |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4385                   |<------------------- GAP_SIZE -------------------->|  */
4386       if (encodep)
4387         result = encode_coding (coding, src, dst, len_byte, 0);
4388       else
4389         result = decode_coding (coding, src, dst, len_byte, 0);
4390       /* to:
4391          +--------+-------converted-text--------+--+---original-text--+---+
4392          |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4393                   |<------------------- GAP_SIZE -------------------->|  */
4394       if (coding->fake_multibyte)
4395         fake_multibyte = 1;
4396
4397       if (!encodep && !multibyte)
4398         coding->produced_char = coding->produced;
4399       inserted += coding->produced_char;
4400       inserted_byte += coding->produced;
4401       len_byte -= coding->consumed;
4402       src += coding->consumed;
4403       dst += inserted_byte;
4404
4405       if (result == CODING_FINISH_NORMAL)
4406         {
4407           src += len_byte;
4408           break;
4409         }
4410       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4411         {
4412           unsigned char *pend = dst, *p = pend - inserted_byte;
4413
4414           /* Encode LFs back to the original eol format (CR or CRLF).  */
4415           if (coding->eol_type == CODING_EOL_CR)
4416             {
4417               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4418             }
4419           else
4420             {
4421               int count = 0;
4422
4423               while (p < pend) if (*p++ == '\n') count++;
4424               if (src - dst < count)
4425                 {
4426                   /* We don't have sufficient room for putting LFs
4427                      back to CRLF.  We must record converted and
4428                      not-yet-converted text back to the buffer
4429                      content, enlarge the gap, then record them out of
4430                      the buffer contents again.  */
4431                   int add = len_byte + inserted_byte;
4432
4433                   GAP_SIZE -= add;
4434                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4435                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4436                   make_gap (count - GAP_SIZE);
4437                   GAP_SIZE += add;
4438                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4439                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4440                   /* Don't forget to update SRC, DST, and PEND.  */
4441                   src = GAP_END_ADDR - len_byte;
4442                   dst = GPT_ADDR + inserted_byte;
4443                   pend = dst;
4444                 }
4445               inserted += count;
4446               inserted_byte += count;
4447               coding->produced += count;
4448               p = dst = pend + count;
4449               while (count)
4450                 {
4451                   *--p = *--pend;
4452                   if (*p == '\n') count--, *--p = '\r';
4453                 }
4454             }
4455
4456           /* Suppress eol-format conversion in the further conversion.  */
4457           coding->eol_type = CODING_EOL_LF;
4458
4459           /* Restore the original symbol.  */
4460           coding->symbol = saved_coding_symbol;
4461
4462           continue;
4463         }
4464       if (len_byte <= 0)
4465         {
4466           if (coding->type != coding_type_ccl
4467               || coding->mode & CODING_MODE_LAST_BLOCK)
4468             break;
4469           coding->mode |= CODING_MODE_LAST_BLOCK;
4470           continue;
4471         }
4472       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4473         {
4474           /* The source text ends in invalid codes.  Let's just
4475              make them valid buffer contents, and finish conversion.  */
4476           inserted += len_byte;
4477           inserted_byte += len_byte;
4478           while (len_byte--)
4479             *dst++ = *src++;
4480           fake_multibyte = 1;
4481           break;
4482         }
4483       if (result == CODING_FINISH_INTERRUPT)
4484         {
4485           /* The conversion procedure was interrupted by a user.  */
4486           fake_multibyte = 1;
4487           break;
4488         }
4489       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
4490       if (coding->consumed < 1)
4491         {
4492           /* It's quite strange to require more memory without
4493              consuming any bytes.  Perhaps CCL program bug.  */
4494           fake_multibyte = 1;
4495           break;
4496         }
4497       if (first)
4498         {
4499           /* We have just done the first batch of conversion which was
4500              stoped because of insufficient gap.  Let's reconsider the
4501              required gap size (i.e. SRT - DST) now.
4502
4503              We have converted ORIG bytes (== coding->consumed) into
4504              NEW bytes (coding->produced).  To convert the remaining
4505              LEN bytes, we may need REQUIRE bytes of gap, where:
4506                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4507                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4508              Here, we are sure that NEW >= ORIG.  */
4509           float ratio = coding->produced - coding->consumed;
4510           ratio /= coding->consumed;
4511           require = len_byte * ratio;
4512           first = 0;
4513         }
4514       if ((src - dst) < (require + 2000))
4515         {
4516           /* See the comment above the previous call of make_gap.  */
4517           int add = len_byte + inserted_byte;
4518
4519           GAP_SIZE -= add;
4520           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4521           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4522           make_gap (require + 2000);
4523           GAP_SIZE += add;
4524           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4525           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4526           /* Don't forget to update SRC, DST.  */
4527           src = GAP_END_ADDR - len_byte;
4528           dst = GPT_ADDR + inserted_byte;
4529         }
4530     }
4531   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
4532
4533   if (multibyte
4534       && (encodep
4535           || fake_multibyte
4536           || (to - from) != (to_byte - from_byte)))
4537     inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4538
4539   /* If we have shrinked the conversion area, adjust it now.  */
4540   if (total_skip > 0)
4541     {
4542       if (tail_skip > 0)
4543         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4544       inserted += total_skip; inserted_byte += total_skip;
4545       GAP_SIZE += total_skip;
4546       GPT -= head_skip; GPT_BYTE -= head_skip;
4547       ZV -= total_skip; ZV_BYTE -= total_skip;
4548       Z -= total_skip; Z_BYTE -= total_skip;
4549       from -= head_skip; from_byte -= head_skip;
4550       to += tail_skip; to_byte += tail_skip;
4551     }
4552
4553   prev_Z = Z;
4554   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4555   inserted = Z - prev_Z;
4556
4557   if (! encodep && ! NILP (coding->post_read_conversion))
4558     {
4559       Lisp_Object val;
4560
4561       if (from != PT)
4562         TEMP_SET_PT_BOTH (from, from_byte);
4563       prev_Z = Z;
4564       val = call1 (coding->post_read_conversion, make_number (inserted));
4565       CHECK_NUMBER (val, 0);
4566       inserted += Z - prev_Z;
4567     }
4568
4569   if (orig_point >= from)
4570     {
4571       if (orig_point >= from + orig_len)
4572         orig_point += inserted - orig_len;
4573       else
4574         orig_point = from;
4575       TEMP_SET_PT (orig_point);
4576     }
4577
4578   signal_after_change (from, to - from, inserted);
4579
4580   {
4581     coding->consumed = to_byte - from_byte;
4582     coding->consumed_char = to - from;
4583     coding->produced = inserted_byte;
4584     coding->produced_char = inserted;
4585   }
4586
4587   return 0;
4588 }
4589
/* Encode (if ENCODEP is nonzero) or decode (if ENCODEP is zero) the
   string STR according to CODING and return the resulting string.

   When no conversion turns out to be necessary (and CODING is not a
   CCL coding system), STR itself is returned if NOCOPY is nonzero,
   otherwise a copy of STR is returned.

   If CODING has a pre-write-conversion (when encoding) or a
   post-read-conversion (when decoding), the work is done in a
   temporary buffer through code_convert_region.  Otherwise the text
   is converted directly into the conversion buffer, and an encoded
   result is returned as a unibyte string while a decoded result is
   returned as a multibyte string.  */

Lisp_Object
code_convert_string (str, coding, encodep, nocopy)
     Lisp_Object str;
     struct coding_system *coding;
     int encodep, nocopy;
{
  int len;
  char *buf;
  int from = 0, to = XSTRING (str)->size;
  int to_byte = STRING_BYTES (XSTRING (str));
  struct gcpro gcpro1;
  Lisp_Object saved_coding_symbol;
  int result;

  saved_coding_symbol = Qnil;
  if (encodep && !NILP (coding->pre_write_conversion)
      || !encodep && !NILP (coding->post_read_conversion))
    {
      /* Since we have to call Lisp functions which assume target text
         is in a buffer, after setting a temporary buffer, call
         code_convert_region.  */
      int count = specpdl_ptr - specpdl;
      struct buffer *prev = current_buffer;

      record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
      temp_output_buffer_setup (" *code-converting-work*");
      set_buffer_internal (XBUFFER (Vstandard_output));
      if (encodep)
        insert_from_string (str, 0, 0, to, to_byte, 0);
      else
        {
          /* We must insert the contents of STR as is without
             unibyte<->multibyte conversion.  */
          current_buffer->enable_multibyte_characters = Qnil;
          insert_from_string (str, 0, 0, to_byte, to_byte, 0);
          current_buffer->enable_multibyte_characters = Qt;
        }
      code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
      if (encodep)
        /* We must return the buffer contents as unibyte string.  */
        current_buffer->enable_multibyte_characters = Qnil;
      str = make_buffer_string (BEGV, ZV, 0);
      set_buffer_internal (prev);
      return unbind_to (count, str);
    }

  if (! encodep && CODING_REQUIRE_DETECTION (coding))
    {
      /* See the comments in code_convert_region.  */
      if (coding->type == coding_type_undecided)
        {
          detect_coding (coding, XSTRING (str)->data, to_byte);
          if (coding->type == coding_type_undecided)
            /* Detection found nothing; fall back to emacs-mule.  */
            coding->type = coding_type_emacs_mule;
        }
      if (coding->eol_type == CODING_EOL_UNDECIDED)
        {
          /* Remember the symbol so that it can be restored if the
             detected eol format proves to be inconsistent below.  */
          saved_coding_symbol = coding->symbol;
          detect_eol (coding, XSTRING (str)->data, to_byte);
          if (coding->eol_type == CODING_EOL_UNDECIDED)
            coding->eol_type = CODING_EOL_LF;
          /* We had better recover the original eol format if we
             encounter an inconsistent eol format while decoding.  */
          coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
        }
    }

  /* If no conversion is required, make the region to convert empty
     (FROM == TO_BYTE); otherwise try to narrow it.  */
  if (encodep
      ? ! CODING_REQUIRE_ENCODING (coding)
      : ! CODING_REQUIRE_DECODING (coding))
    from = to_byte;
  else
    {
      /* Try to skip the heading and trailing ASCIIs.  */
      SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
                                encodep);
    }
  /* Nothing left to convert: return STR or a copy of it, except for
     CCL coding systems, which are always run.  */
  if (from == to_byte
      && coding->type != coding_type_ccl)
    return (nocopy ? str : Fcopy_sequence (str));

  if (encodep)
    len = encoding_buffer_size (coding, to_byte - from);
  else
    len = decoding_buffer_size (coding, to_byte - from);
  /* Add room for the skipped head (FROM bytes) and the skipped tail,
     which are copied into the result unchanged.  */
  len += from + STRING_BYTES (XSTRING (str)) - to_byte;
  /* NOTE(review): STR is protected here presumably because
     get_conversion_buffer may allocate -- confirm.  */
  GCPRO1 (str);
  buf = get_conversion_buffer (len);
  UNGCPRO;

  /* Copy the skipped head as is.  */
  if (from > 0)
    bcopy (XSTRING (str)->data, buf, from);
  result = (encodep
            ? encode_coding (coding, XSTRING (str)->data + from,
                             buf + from, to_byte - from, len)
            : decode_coding (coding, XSTRING (str)->data + from,
                             buf + from, to_byte - from, len));
  if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
    {
      /* We simply try to decode the whole string again but without
         eol-conversion this time.  */
      coding->eol_type = CODING_EOL_LF;
      coding->symbol = saved_coding_symbol;
      return code_convert_string (str, coding, encodep, nocopy);
    }

  /* Copy the skipped tail as is, right after the converted text.  */
  bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
         STRING_BYTES (XSTRING (str)) - to_byte);

  /* LEN is now the number of bytes copied verbatim (head plus tail).  */
  len = from + STRING_BYTES (XSTRING (str)) - to_byte;
  if (encodep)
    str = make_unibyte_string (buf, len + coding->produced);
  else
    {
      /* If the decoder flagged fake multibyte text, recount the
         characters from the produced bytes instead of trusting
         coding->produced_char.  */
      int chars= (coding->fake_multibyte
                  ? multibyte_chars_in_text (buf + from, coding->produced)
                  : coding->produced_char);
      str = make_multibyte_string (buf, len + chars, len + coding->produced);
    }

  return str;
}
4712
4713 \f
4714 #ifdef emacs
4715 /*** 8. Emacs Lisp library functions ***/
4716
4717 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4718   "Return t if OBJECT is nil or a coding-system.\n\
4719 See the documentation of `make-coding-system' for information\n\
4720 about coding-system objects.")
4721   (obj)
4722      Lisp_Object obj;
4723 {
4724   if (NILP (obj))
4725     return Qt;
4726   if (!SYMBOLP (obj))
4727     return Qnil;
4728   /* Get coding-spec vector for OBJ.  */
4729   obj = Fget (obj, Qcoding_system);
4730   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4731           ? Qt : Qnil);
4732 }
4733
4734 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4735        Sread_non_nil_coding_system, 1, 1, 0,
4736   "Read a coding system from the minibuffer, prompting with string PROMPT.")
4737   (prompt)
4738      Lisp_Object prompt;
4739 {
4740   Lisp_Object val;
4741   do
4742     {
4743       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4744                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4745     }
4746   while (XSTRING (val)->size == 0);
4747   return (Fintern (val, Qnil));
4748 }
4749
4750 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4751   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4752 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4753   (prompt, default_coding_system)
4754      Lisp_Object prompt, default_coding_system;
4755 {
4756   Lisp_Object val;
4757   if (SYMBOLP (default_coding_system))
4758     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4759   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4760                           Qt, Qnil, Qcoding_system_history,
4761                           default_coding_system, Qnil);
4762   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4763 }
4764
4765 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4766        1, 1, 0,
4767   "Check validity of CODING-SYSTEM.\n\
4768 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4769 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4770 The value of property should be a vector of length 5.")
4771   (coding_system)
4772      Lisp_Object coding_system;
4773 {
4774   CHECK_SYMBOL (coding_system, 0);
4775   if (!NILP (Fcoding_system_p (coding_system)))
4776     return coding_system;
4777   while (1)
4778     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4779 }
4780 \f
4781 Lisp_Object
4782 detect_coding_system (src, src_bytes, highest)
4783      unsigned char *src;
4784      int src_bytes, highest;
4785 {
4786   int coding_mask, eol_type;
4787   Lisp_Object val, tmp;
4788   int dummy;
4789
4790   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4791   eol_type  = detect_eol_type (src, src_bytes, &dummy);
4792   if (eol_type == CODING_EOL_INCONSISTENT)
4793     eol_type = CODING_EOL_UNDECIDED;
4794
4795   if (!coding_mask)
4796     {
4797       val = Qundecided;
4798       if (eol_type != CODING_EOL_UNDECIDED)
4799         {
4800           Lisp_Object val2;
4801           val2 = Fget (Qundecided, Qeol_type);
4802           if (VECTORP (val2))
4803             val = XVECTOR (val2)->contents[eol_type];
4804         }
4805       return (highest ? val : Fcons (val, Qnil));
4806     }
4807
4808   /* At first, gather possible coding systems in VAL.  */
4809   val = Qnil;
4810   for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4811     {
4812       int idx
4813         = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4814       if (coding_mask & (1 << idx))
4815         {
4816           val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4817           if (highest)
4818             break;
4819         }
4820     }
4821   if (!highest)
4822     val = Fnreverse (val);
4823
4824   /* Then, replace the elements with subsidiary coding systems.  */
4825   for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4826     {
4827       if (eol_type != CODING_EOL_UNDECIDED
4828           && eol_type != CODING_EOL_INCONSISTENT)
4829         {
4830           Lisp_Object eol;
4831           eol = Fget (XCONS (tmp)->car, Qeol_type);
4832           if (VECTORP (eol))
4833             XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4834         }
4835     }
4836   return (highest ? XCONS (val)->car : val);
4837 }
4838
4839 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4840        2, 3, 0,
4841   "Detect coding system of the text in the region between START and END.\n\
4842 Return a list of possible coding systems ordered by priority.\n\
4843 \n\
4844 If only ASCII characters are found, it returns a list of single element\n\
4845 `undecided' or its subsidiary coding system according to a detected\n\
4846 end-of-line format.\n\
4847 \n\
4848 If optional argument HIGHEST is non-nil, return the coding system of\n\
4849 highest priority.")
4850   (start, end, highest)
4851      Lisp_Object start, end, highest;
4852 {
4853   int from, to;
4854   int from_byte, to_byte;
4855
4856   CHECK_NUMBER_COERCE_MARKER (start, 0);
4857   CHECK_NUMBER_COERCE_MARKER (end, 1);
4858
4859   validate_region (&start, &end);
4860   from = XINT (start), to = XINT (end);
4861   from_byte = CHAR_TO_BYTE (from);
4862   to_byte = CHAR_TO_BYTE (to);
4863
4864   if (from < GPT && to >= GPT)
4865     move_gap_both (to, to_byte);
4866
4867   return detect_coding_system (BYTE_POS_ADDR (from_byte),
4868                                to_byte - from_byte,
4869                                !NILP (highest));
4870 }
4871
DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
       1, 2, 0,
  "Detect coding system of the text in STRING.\n\
Return a list of possible coding systems ordered by priority.\n\
\n\
If only ASCII characters are found, it returns a list of single element\n\
`undecided' or its subsidiary coding system according to a detected\n\
end-of-line format.\n\
\n\
If optional argument HIGHEST is non-nil, return the coding system of\n\
highest priority.")
  (string, highest)
     Lisp_Object string, highest;
{
  CHECK_STRING (string, 0);

  /* Run the detector over the raw bytes of STRING; HIGHEST is
     converted from a Lisp boolean to a C flag (0 or 1).  */
  return detect_coding_system (XSTRING (string)->data,
                               STRING_BYTES (XSTRING (string)),
                               !NILP (highest));
}
4892
4893 Lisp_Object
4894 code_convert_region1 (start, end, coding_system, encodep)
4895      Lisp_Object start, end, coding_system;
4896      int encodep;
4897 {
4898   struct coding_system coding;
4899   int from, to, len;
4900
4901   CHECK_NUMBER_COERCE_MARKER (start, 0);
4902   CHECK_NUMBER_COERCE_MARKER (end, 1);
4903   CHECK_SYMBOL (coding_system, 2);
4904
4905   validate_region (&start, &end);
4906   from = XFASTINT (start);
4907   to = XFASTINT (end);
4908
4909   if (NILP (coding_system))
4910     return make_number (to - from);
4911
4912   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4913     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4914
4915   coding.mode |= CODING_MODE_LAST_BLOCK;
4916   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4917                        &coding, encodep, 1);
4918   Vlast_coding_system_used = coding.symbol;
4919   return make_number (coding.produced_char);
4920 }
4921
DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
       3, 3, "r\nzCoding system: ",
  "Decode the current region by specified coding system.\n\
When called from a program, takes three arguments:\n\
START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
not fully specified.)\n\
It returns the length of the decoded text.")
  (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  /* The last argument is ENCODEP; zero selects decoding.  */
  return code_convert_region1 (start, end, coding_system, 0);
}
4936
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
       3, 3, "r\nzCoding system: ",
  "Encode the current region by specified coding system.\n\
When called from a program, takes three arguments:\n\
START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
not fully specified.)\n\
It returns the length of the encoded text.")
  (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  /* The last argument is ENCODEP; nonzero selects encoding.  */
  return code_convert_region1 (start, end, coding_system, 1);
}
4951
4952 Lisp_Object
4953 code_convert_string1 (string, coding_system, nocopy, encodep)
4954      Lisp_Object string, coding_system, nocopy;
4955      int encodep;
4956 {
4957   struct coding_system coding;
4958
4959   CHECK_STRING (string, 0);
4960   CHECK_SYMBOL (coding_system, 1);
4961
4962   if (NILP (coding_system))
4963     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4964
4965   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4966     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4967
4968   coding.mode |= CODING_MODE_LAST_BLOCK;
4969   Vlast_coding_system_used = coding.symbol;
4970   return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4971 }
4972
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
       2, 3, 0,
  "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
if the decoding operation is trivial.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
not fully specified.)")
  (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  /* The last argument is ENCODEP; zero selects decoding.  */
  return code_convert_string1 (string, coding_system, nocopy, 0);
}
4986
DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
       2, 3, 0,
  "Encode STRING to CODING-SYSTEM, and return the result.\n\
Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
if the encoding operation is trivial.\n\
This function sets `last-coding-system-used' to the precise coding system\n\
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
not fully specified.)")
  (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  /* The last argument is ENCODEP; nonzero selects encoding.  */
  return code_convert_string1 (string, coding_system, nocopy, 1);
}
5000
5001 /* Encode or decode STRING according to CODING_SYSTEM.
5002    Do not set Vlast_coding_system_used.  */
5003
5004 Lisp_Object
5005 code_convert_string_norecord (string, coding_system, encodep)
5006      Lisp_Object string, coding_system;
5007      int encodep;
5008 {
5009   struct coding_system coding;
5010
5011   CHECK_STRING (string, 0);
5012   CHECK_SYMBOL (coding_system, 1);
5013
5014   if (NILP (coding_system))
5015     return string;
5016
5017   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5018     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5019
5020   coding.mode |= CODING_MODE_LAST_BLOCK;
5021   return code_convert_string (string, &coding, encodep, Qt);
5022 }
5023 \f
5024 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5025   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5026 Return the corresponding character.")
5027   (code)
5028      Lisp_Object code;
5029 {
5030   unsigned char c1, c2, s1, s2;
5031   Lisp_Object val;
5032
5033   CHECK_NUMBER (code, 0);
5034   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5035   if (s1 == 0)
5036     {
5037       if (s2 < 0x80)
5038         XSETFASTINT (val, s2);
5039       else if (s2 >= 0xA0 || s2 <= 0xDF)
5040         XSETFASTINT (val,
5041                      MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5042       else
5043         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5044     }
5045   else
5046     {
5047       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5048           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5049         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5050       DECODE_SJIS (s1, s2, c1, c2);
5051       XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5052     }
5053   return val;
5054 }
5055
5056 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5057   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5058 Return the corresponding code in SJIS.")
5059   (ch)
5060      Lisp_Object ch;
5061 {
5062   int charset, c1, c2, s1, s2;
5063   Lisp_Object val;
5064
5065   CHECK_NUMBER (ch, 0);
5066   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5067   if (charset == CHARSET_ASCII)
5068     {
5069       val = ch;
5070     }
5071   else if (charset == charset_jisx0208
5072            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5073     {
5074       ENCODE_SJIS (c1, c2, s1, s2);
5075       XSETFASTINT (val, (s1 << 8) | s2);
5076     }
5077   else if (charset == charset_katakana_jisx0201
5078            && c1 > 0x20 && c2 < 0xE0)
5079     {
5080       XSETFASTINT (val, c1 | 0x80);
5081     }
5082   else
5083     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5084   return val;
5085 }
5086
5087 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5088   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5089 Return the corresponding character.")
5090   (code)
5091      Lisp_Object code;
5092 {
5093   int charset;
5094   unsigned char b1, b2, c1, c2;
5095   Lisp_Object val;
5096
5097   CHECK_NUMBER (code, 0);
5098   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5099   if (b1 == 0)
5100     {
5101       if (b2 >= 0x80)
5102         error ("Invalid BIG5 code: %x", XFASTINT (code));
5103       val = code;
5104     }
5105   else
5106     {
5107       if ((b1 < 0xA1 || b1 > 0xFE)
5108           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5109         error ("Invalid BIG5 code: %x", XFASTINT (code));
5110       DECODE_BIG5 (b1, b2, charset, c1, c2);
5111       XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5112     }
5113   return val;
5114 }
5115
5116 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5117   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5118 Return the corresponding character code in Big5.")
5119   (ch)
5120      Lisp_Object ch;
5121 {
5122   int charset, c1, c2, b1, b2;
5123   Lisp_Object val;
5124
5125   CHECK_NUMBER (ch, 0);
5126   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5127   if (charset == CHARSET_ASCII)
5128     {
5129       val = ch;
5130     }
5131   else if ((charset == charset_big5_1
5132             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5133            || (charset == charset_big5_2
5134                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5135     {
5136       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5137       XSETFASTINT (val, (b1 << 8) | b2);
5138     }
5139   else
5140     error ("Can't encode to Big5: %d", XFASTINT (ch));
5141   return val;
5142 }
5143 \f
5144 DEFUN ("set-terminal-coding-system-internal",
5145        Fset_terminal_coding_system_internal,
5146        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5147   (coding_system)
5148      Lisp_Object coding_system;
5149 {
5150   CHECK_SYMBOL (coding_system, 0);
5151   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5152   /* We had better not send unsafe characters to terminal.  */
5153   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5154
5155   return Qnil;
5156 }
5157
5158 DEFUN ("set-safe-terminal-coding-system-internal",
5159        Fset_safe_terminal_coding_system_internal,
5160        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5161   (coding_system)
5162      Lisp_Object coding_system;
5163 {
5164   CHECK_SYMBOL (coding_system, 0);
5165   setup_coding_system (Fcheck_coding_system (coding_system),
5166                        &safe_terminal_coding);
5167   return Qnil;
5168 }
5169
DEFUN ("terminal-coding-system",
       Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
  "Return coding system specified for terminal output.")
  ()
{
  /* The symbol stored in the global terminal_coding structure.  */
  return terminal_coding.symbol;
}
5177
5178 DEFUN ("set-keyboard-coding-system-internal",
5179        Fset_keyboard_coding_system_internal,
5180        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5181   (coding_system)
5182      Lisp_Object coding_system;
5183 {
5184   CHECK_SYMBOL (coding_system, 0);
5185   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5186   return Qnil;
5187 }
5188
DEFUN ("keyboard-coding-system",
       Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
  "Return coding system specified for decoding keyboard input.")
  ()
{
  /* The symbol stored in the global keyboard_coding structure.  */
  return keyboard_coding.symbol;
}
5196
5197 \f
5198 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5199        Sfind_operation_coding_system,  1, MANY, 0,
5200   "Choose a coding system for an operation based on the target name.\n\
5201 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5202 DECODING-SYSTEM is the coding system to use for decoding\n\
5203 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5204 for encoding (in case OPERATION does encoding).\n\
5205 \n\
5206 The first argument OPERATION specifies an I/O primitive:\n\
5207   For file I/O, `insert-file-contents' or `write-region'.\n\
5208   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5209   For network I/O, `open-network-stream'.\n\
5210 \n\
5211 The remaining arguments should be the same arguments that were passed\n\
5212 to the primitive.  Depending on which primitive, one of those arguments\n\
5213 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5214 whichever argument specifies the file name is TARGET.\n\
5215 \n\
5216 TARGET has a meaning which depends on OPERATION:\n\
5217   For file I/O, TARGET is a file name.\n\
5218   For process I/O, TARGET is a process name.\n\
5219   For network I/O, TARGET is a service name or a port number\n\
5220 \n\
5221 This function looks up what specified for TARGET in,\n\
5222 `file-coding-system-alist', `process-coding-system-alist',\n\
5223 or `network-coding-system-alist' depending on OPERATION.\n\
5224 They may specify a coding system, a cons of coding systems,\n\
5225 or a function symbol to call.\n\
5226 In the last case, we call the function with one argument,\n\
5227 which is a list of all the arguments given to this function.")
5228   (nargs, args)
5229      int nargs;
5230      Lisp_Object *args;
5231 {
5232   Lisp_Object operation, target_idx, target, val;
5233   register Lisp_Object chain;
5234
5235   if (nargs < 2)
5236     error ("Too few arguments");
5237   operation = args[0];
5238   if (!SYMBOLP (operation)
5239       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5240     error ("Invalid first arguement");
5241   if (nargs < 1 + XINT (target_idx))
5242     error ("Too few arguments for operation: %s",
5243            XSYMBOL (operation)->name->data);
5244   target = args[XINT (target_idx) + 1];
5245   if (!(STRINGP (target)
5246         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5247     error ("Invalid %dth argument", XINT (target_idx) + 1);
5248
5249   chain = ((EQ (operation, Qinsert_file_contents)
5250             || EQ (operation, Qwrite_region))
5251            ? Vfile_coding_system_alist
5252            : (EQ (operation, Qopen_network_stream)
5253               ? Vnetwork_coding_system_alist
5254               : Vprocess_coding_system_alist));
5255   if (NILP (chain))
5256     return Qnil;
5257
5258   for (; CONSP (chain); chain = XCONS (chain)->cdr)
5259     {
5260       Lisp_Object elt;
5261       elt = XCONS (chain)->car;
5262
5263       if (CONSP (elt)
5264           && ((STRINGP (target)
5265                && STRINGP (XCONS (elt)->car)
5266                && fast_string_match (XCONS (elt)->car, target) >= 0)
5267               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
5268         {
5269           val = XCONS (elt)->cdr;
5270           /* Here, if VAL is both a valid coding system and a valid
5271              function symbol, we return VAL as a coding system.  */
5272           if (CONSP (val))
5273             return val;
5274           if (! SYMBOLP (val))
5275             return Qnil;
5276           if (! NILP (Fcoding_system_p (val)))
5277             return Fcons (val, val);
5278           if (! NILP (Ffboundp (val)))
5279             {
5280               val = call1 (val, Flist (nargs, args));
5281               if (CONSP (val))
5282                 return val;
5283               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5284                 return Fcons (val, val);
5285             }
5286           return Qnil;
5287         }
5288     }
5289   return Qnil;
5290 }
5291
5292 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5293        Supdate_coding_systems_internal, 0, 0, 0,
5294   "Update internal database for ISO2022 and CCL based coding systems.\n\
5295 When values of the following coding categories are changed, you must\n\
5296 call this function:\n\
5297   coding-category-iso-7, coding-category-iso-7-tight,\n\
5298   coding-category-iso-8-1, coding-category-iso-8-2,\n\
5299   coding-category-iso-7-else, coding-category-iso-8-else,\n\
5300   coding-category-ccl")
5301   ()
5302 {
5303   int i;
5304
5305   for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
5306     {
5307       Lisp_Object val;
5308
5309       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5310       if (!NILP (val))
5311         {
5312           if (! coding_system_table[i])
5313             coding_system_table[i] = ((struct coding_system *)
5314                                       xmalloc (sizeof (struct coding_system)));
5315           setup_coding_system (val, coding_system_table[i]);
5316         }
5317       else if (coding_system_table[i])
5318         {
5319           xfree (coding_system_table[i]);
5320           coding_system_table[i] = NULL;
5321         }
5322     }
5323
5324   return Qnil;
5325 }
5326
5327 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5328        Sset_coding_priority_internal, 0, 0, 0,
5329   "Update internal database for the current value of `coding-category-list'.\n\
5330 This function is internal use only.")
5331   ()
5332 {
5333   int i = 0, idx;
5334   Lisp_Object val;
5335
5336   val = Vcoding_category_list;
5337
5338   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5339     {
5340       if (! SYMBOLP (XCONS (val)->car))
5341         break;
5342       idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5343       if (idx >= CODING_CATEGORY_IDX_MAX)
5344         break;
5345       coding_priorities[i++] = (1 << idx);
5346       val = XCONS (val)->cdr;
5347     }
5348   /* If coding-category-list is valid and contains all coding
5349      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5350      the following code saves Emacs from craching.  */
5351   while (i < CODING_CATEGORY_IDX_MAX)
5352     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5353
5354   return Qnil;
5355 }
5356
5357 #endif /* emacs */
5358
5359 \f
5360 /*** 9. Post-amble ***/
5361
5362 void
5363 init_coding ()
5364 {
5365   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5366 }
5367
5368 void
5369 init_coding_once ()
5370 {
5371   int i;
5372
5373   /* Emacs' internal format specific initialize routine.  */
5374   for (i = 0; i <= 0x20; i++)
5375     emacs_code_class[i] = EMACS_control_code;
5376   emacs_code_class[0x0A] = EMACS_linefeed_code;
5377   emacs_code_class[0x0D] = EMACS_carriage_return_code;
5378   for (i = 0x21 ; i < 0x7F; i++)
5379     emacs_code_class[i] = EMACS_ascii_code;
5380   emacs_code_class[0x7F] = EMACS_control_code;
5381   emacs_code_class[0x80] = EMACS_leading_code_composition;
5382   for (i = 0x81; i < 0xFF; i++)
5383     emacs_code_class[i] = EMACS_invalid_code;
5384   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5385   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5386   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5387   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5388
5389   /* ISO2022 specific initialize routine.  */
5390   for (i = 0; i < 0x20; i++)
5391     iso_code_class[i] = ISO_control_code;
5392   for (i = 0x21; i < 0x7F; i++)
5393     iso_code_class[i] = ISO_graphic_plane_0;
5394   for (i = 0x80; i < 0xA0; i++)
5395     iso_code_class[i] = ISO_control_code;
5396   for (i = 0xA1; i < 0xFF; i++)
5397     iso_code_class[i] = ISO_graphic_plane_1;
5398   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5399   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5400   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5401   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5402   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5403   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5404   iso_code_class[ISO_CODE_ESC] = ISO_escape;
5405   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5406   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5407   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5408
5409   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5410
5411   setup_coding_system (Qnil, &keyboard_coding);
5412   setup_coding_system (Qnil, &terminal_coding);
5413   setup_coding_system (Qnil, &safe_terminal_coding);
5414   setup_coding_system (Qnil, &default_buffer_file_coding);
5415
5416   bzero (coding_system_table, sizeof coding_system_table);
5417
5418   bzero (ascii_skip_code, sizeof ascii_skip_code);
5419   for (i = 0; i < 128; i++)
5420     ascii_skip_code[i] = 1;
5421
5422 #if defined (MSDOS) || defined (WINDOWSNT)
5423   system_eol_type = CODING_EOL_CRLF;
5424 #else
5425   system_eol_type = CODING_EOL_LF;
5426 #endif
5427 }
5428
5429 #ifdef emacs
5430
5431 void
5432 syms_of_coding ()
5433 {
5434   Qtarget_idx = intern ("target-idx");
5435   staticpro (&Qtarget_idx);
5436
5437   Qcoding_system_history = intern ("coding-system-history");
5438   staticpro (&Qcoding_system_history);
5439   Fset (Qcoding_system_history, Qnil);
5440
5441   /* Target FILENAME is the first argument.  */
5442   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5443   /* Target FILENAME is the third argument.  */
5444   Fput (Qwrite_region, Qtarget_idx, make_number (2));
5445
5446   Qcall_process = intern ("call-process");
5447   staticpro (&Qcall_process);
5448   /* Target PROGRAM is the first argument.  */
5449   Fput (Qcall_process, Qtarget_idx, make_number (0));
5450
5451   Qcall_process_region = intern ("call-process-region");
5452   staticpro (&Qcall_process_region);
5453   /* Target PROGRAM is the third argument.  */
5454   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5455
5456   Qstart_process = intern ("start-process");
5457   staticpro (&Qstart_process);
5458   /* Target PROGRAM is the third argument.  */
5459   Fput (Qstart_process, Qtarget_idx, make_number (2));
5460
5461   Qopen_network_stream = intern ("open-network-stream");
5462   staticpro (&Qopen_network_stream);
5463   /* Target SERVICE is the fourth argument.  */
5464   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5465
5466   Qcoding_system = intern ("coding-system");
5467   staticpro (&Qcoding_system);
5468
5469   Qeol_type = intern ("eol-type");
5470   staticpro (&Qeol_type);
5471
5472   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5473   staticpro (&Qbuffer_file_coding_system);
5474
5475   Qpost_read_conversion = intern ("post-read-conversion");
5476   staticpro (&Qpost_read_conversion);
5477
5478   Qpre_write_conversion = intern ("pre-write-conversion");
5479   staticpro (&Qpre_write_conversion);
5480
5481   Qno_conversion = intern ("no-conversion");
5482   staticpro (&Qno_conversion);
5483
5484   Qundecided = intern ("undecided");
5485   staticpro (&Qundecided);
5486
5487   Qcoding_system_p = intern ("coding-system-p");
5488   staticpro (&Qcoding_system_p);
5489
5490   Qcoding_system_error = intern ("coding-system-error");
5491   staticpro (&Qcoding_system_error);
5492
5493   Fput (Qcoding_system_error, Qerror_conditions,
5494         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5495   Fput (Qcoding_system_error, Qerror_message,
5496         build_string ("Invalid coding system"));
5497
5498   Qcoding_category = intern ("coding-category");
5499   staticpro (&Qcoding_category);
5500   Qcoding_category_index = intern ("coding-category-index");
5501   staticpro (&Qcoding_category_index);
5502
5503   Vcoding_category_table
5504     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5505   staticpro (&Vcoding_category_table);
5506   {
5507     int i;
5508     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5509       {
5510         XVECTOR (Vcoding_category_table)->contents[i]
5511           = intern (coding_category_name[i]);
5512         Fput (XVECTOR (Vcoding_category_table)->contents[i],
5513               Qcoding_category_index, make_number (i));
5514       }
5515   }
5516
5517   Qtranslation_table = intern ("translation-table");
5518   staticpro (&Qtranslation_table);
5519   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
5520
5521   Qtranslation_table_id = intern ("translation-table-id");
5522   staticpro (&Qtranslation_table_id);
5523
5524   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5525   staticpro (&Qtranslation_table_for_decode);
5526
5527   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5528   staticpro (&Qtranslation_table_for_encode);
5529
5530   Qsafe_charsets = intern ("safe-charsets");
5531   staticpro (&Qsafe_charsets);
5532
5533   Qvalid_codes = intern ("valid-codes");
5534   staticpro (&Qvalid_codes);
5535
5536   Qemacs_mule = intern ("emacs-mule");
5537   staticpro (&Qemacs_mule);
5538
5539   Qraw_text = intern ("raw-text");
5540   staticpro (&Qraw_text);
5541
5542   defsubr (&Scoding_system_p);
5543   defsubr (&Sread_coding_system);
5544   defsubr (&Sread_non_nil_coding_system);
5545   defsubr (&Scheck_coding_system);
5546   defsubr (&Sdetect_coding_region);
5547   defsubr (&Sdetect_coding_string);
5548   defsubr (&Sdecode_coding_region);
5549   defsubr (&Sencode_coding_region);
5550   defsubr (&Sdecode_coding_string);
5551   defsubr (&Sencode_coding_string);
5552   defsubr (&Sdecode_sjis_char);
5553   defsubr (&Sencode_sjis_char);
5554   defsubr (&Sdecode_big5_char);
5555   defsubr (&Sencode_big5_char);
5556   defsubr (&Sset_terminal_coding_system_internal);
5557   defsubr (&Sset_safe_terminal_coding_system_internal);
5558   defsubr (&Sterminal_coding_system);
5559   defsubr (&Sset_keyboard_coding_system_internal);
5560   defsubr (&Skeyboard_coding_system);
5561   defsubr (&Sfind_operation_coding_system);
5562   defsubr (&Supdate_coding_systems_internal);
5563   defsubr (&Sset_coding_priority_internal);
5564
5565   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5566     "List of coding systems.\n\
5567 \n\
5568 Do not alter the value of this variable manually.  This variable should be\n\
5569 updated by the functions `make-coding-system' and\n\
5570 `define-coding-system-alias'.");
5571   Vcoding_system_list = Qnil;
5572
5573   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5574     "Alist of coding system names.\n\
5575 Each element is one element list of coding system name.\n\
5576 This variable is given to `completing-read' as TABLE argument.\n\
5577 \n\
5578 Do not alter the value of this variable manually.  This variable should be\n\
5579 updated by the functions `make-coding-system' and\n\
5580 `define-coding-system-alias'.");
5581   Vcoding_system_alist = Qnil;
5582
5583   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5584     "List of coding-categories (symbols) ordered by priority.");
5585   {
5586     int i;
5587
5588     Vcoding_category_list = Qnil;
5589     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5590       Vcoding_category_list
5591         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5592                  Vcoding_category_list);
5593   }
5594
5595   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
5596     "Specify the coding system for read operations.\n\
5597 It is useful to bind this variable with `let', but do not set it globally.\n\
5598 If the value is a coding system, it is used for decoding on read operation.\n\
5599 If not, an appropriate element is used from one of the coding system alists:\n\
5600 There are three such tables, `file-coding-system-alist',\n\
5601 `process-coding-system-alist', and `network-coding-system-alist'.");
5602   Vcoding_system_for_read = Qnil;
5603
5604   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
5605     "Specify the coding system for write operations.\n\
5606 It is useful to bind this variable with `let', but do not set it globally.\n\
5607 If the value is a coding system, it is used for encoding on write operation.\n\
5608 If not, an appropriate element is used from one of the coding system alists:\n\
5609 There are three such tables, `file-coding-system-alist',\n\
5610 `process-coding-system-alist', and `network-coding-system-alist'.");
5611   Vcoding_system_for_write = Qnil;
5612
5613   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
5614     "Coding system used in the latest file or process I/O.");
5615   Vlast_coding_system_used = Qnil;
5616
5617   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5618     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
5619 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5620 such conversion.");
5621   inhibit_eol_conversion = 0;
5622
5623   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5624     "Non-nil means process buffer inherits coding system of process output.\n\
5625 Bind it to t if the process output is to be treated as if it were a file\n\
5626 read from some filesystem.");
5627   inherit_process_coding_system = 0;
5628
5629   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5630     "Alist to decide a coding system to use for a file I/O operation.\n\
5631 The format is ((PATTERN . VAL) ...),\n\
5632 where PATTERN is a regular expression matching a file name,\n\
5633 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5634 If VAL is a coding system, it is used for both decoding and encoding\n\
5635 the file contents.\n\
5636 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5637 and the cdr part is used for encoding.\n\
5638 If VAL is a function symbol, the function must return a coding system\n\
5639 or a cons of coding systems which are used as above.\n\
5640 \n\
5641 See also the function `find-operation-coding-system'\n\
5642 and the variable `auto-coding-alist'.");
5643   Vfile_coding_system_alist = Qnil;
5644
5645   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5646     "Alist to decide a coding system to use for a process I/O operation.\n\
5647 The format is ((PATTERN . VAL) ...),\n\
5648 where PATTERN is a regular expression matching a program name,\n\
5649 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5650 If VAL is a coding system, it is used for both decoding what received\n\
5651 from the program and encoding what sent to the program.\n\
5652 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5653 and the cdr part is used for encoding.\n\
5654 If VAL is a function symbol, the function must return a coding system\n\
5655 or a cons of coding systems which are used as above.\n\
5656 \n\
5657 See also the function `find-operation-coding-system'.");
5658   Vprocess_coding_system_alist = Qnil;
5659
5660   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5661     "Alist to decide a coding system to use for a network I/O operation.\n\
5662 The format is ((PATTERN . VAL) ...),\n\
5663 where PATTERN is a regular expression matching a network service name\n\
5664 or is a port number to connect to,\n\
5665 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5666 If VAL is a coding system, it is used for both decoding what received\n\
5667 from the network stream and encoding what sent to the network stream.\n\
5668 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5669 and the cdr part is used for encoding.\n\
5670 If VAL is a function symbol, the function must return a coding system\n\
5671 or a cons of coding systems which are used as above.\n\
5672 \n\
5673 See also the function `find-operation-coding-system'.");
5674   Vnetwork_coding_system_alist = Qnil;
5675
5676   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5677     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5678   eol_mnemonic_unix = build_string (":");
5679
5680   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5681     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5682   eol_mnemonic_dos = build_string ("\\");
5683
5684   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5685     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5686   eol_mnemonic_mac = build_string ("/");
5687
5688   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5689     "*String displayed in mode line when end-of-line format is not yet determined.");
5690   eol_mnemonic_undecided = build_string (":");
5691
5692   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
5693     "*Non-nil enables character translation while encoding and decoding.");
5694   Venable_character_translation = Qt;
5695
5696   DEFVAR_LISP ("standard-translation-table-for-decode",
5697     &Vstandard_translation_table_for_decode,
5698     "Table for translating characters while decoding.");
5699   Vstandard_translation_table_for_decode = Qnil;
5700
5701   DEFVAR_LISP ("standard-translation-table-for-encode",
5702     &Vstandard_translation_table_for_encode,
5703     "Table for translationg characters while encoding.");
5704   Vstandard_translation_table_for_encode = Qnil;
5705
5706   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5707     "Alist of charsets vs revision numbers.\n\
5708 While encoding, if a charset (car part of an element) is found,\n\
5709 designate it with the escape sequence identifing revision (cdr part of the element).");
5710   Vcharset_revision_alist = Qnil;
5711
5712   DEFVAR_LISP ("default-process-coding-system",
5713                &Vdefault_process_coding_system,
5714     "Cons of coding systems used for process I/O by default.\n\
5715 The car part is used for decoding a process output,\n\
5716 the cdr part is used for encoding a text to be sent to a process.");
5717   Vdefault_process_coding_system = Qnil;
5718
5719   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5720     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5721 This is a vector of length 256.\n\
5722 If Nth element is non-nil, the existence of code N in a file\n\
5723 \(or output of subprocess) doesn't prevent it to be detected as\n\
5724 a coding system of ISO 2022 variant which has a flag\n\
5725 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5726 or reading output of a subprocess.\n\
5727 Only 128th through 159th elements has a meaning.");
5728   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5729
5730   DEFVAR_LISP ("select-safe-coding-system-function",
5731                &Vselect_safe_coding_system_function,
5732     "Function to call to select safe coding system for encoding a text.\n\
5733 \n\
5734 If set, this function is called to force a user to select a proper\n\
5735 coding system which can encode the text in the case that a default\n\
5736 coding system used in each operation can't encode the text.\n\
5737 \n\
5738 The default value is `select-safe-coding-system' (which see).");
5739   Vselect_safe_coding_system_function = Qnil;
5740
5741 }
5742
5743 #endif /* emacs */