src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-internal), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Other
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See section 6 for more details.
  81
  82 */
  83
  84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  85
  86   How end-of-line of a text is encoded depends on a system.  For
  87   instance, Unix's format is just one byte of `line-feed' code,
  88   whereas DOS's format is two-byte sequence of `carriage-return' and
  89   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  90
  91   Since text characters encoding and end-of-line encoding are
  92   independent, any coding system described above can take
  93   any format of end-of-line.  So, Emacs has information of format of
  94   end-of-line in each coding-system.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  99
 100   These functions check if a text between SRC and SRC_END is encoded
 101   in the coding system category XXX.  Each returns an integer value in
 102   which appropriate flag bits for the category XXX is set.  The flag
 103   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 104   template of these functions.  */
 105 #if 0
 106 int
 107 detect_coding_emacs_mule (src, src_end)
 108      unsigned char *src, *src_end;
 109 {
 110   ...
 111 }
 112 #endif
 113
 114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 115
 116   These functions decode SRC_BYTES length text at SOURCE encoded in
 117   CODING to Emacs' internal format (emacs-mule).  The resulting text
 118   goes to a place pointed to by DESTINATION, the length of which should
 119   not exceed DST_BYTES.  The number of bytes actually processed is
 120   returned as *CONSUMED.  The return value is the length of the decoded
 121   text.  Below is a template of these functions.  */
 122 #if 0
 123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 124      struct coding_system *coding;
 125      unsigned char *source, *destination;
 126      int src_bytes, dst_bytes;
 127      int *consumed;
 128 {
 129   ...
 130 }
 131 #endif
 132
 133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 134
 135   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 136   internal format (emacs-mule) to CODING.  The resulting text goes to
 137   a place pointed to by DESTINATION, the length of which should not
 138   exceed DST_BYTES.  The number of bytes actually processed is
 139   returned as *CONSUMED.  The return value is the length of the
 140   encoded text.  Below is a template of these functions.  */
 141 #if 0
 142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 143      struct coding_system *coding;
 144      unsigned char *source, *destination;
 145      int src_bytes, dst_bytes;
 146      int *consumed;
 147 {
 148   ...
 149 }
 150 #endif
 151
 152 /*** COMMONLY USED MACROS ***/
 153
 154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 155    THREE_MORE_BYTES safely get one, two, and three bytes from the
 156    source text respectively.  If there are not enough bytes in the
 157    source, they jump to `label_end_of_loop'.  The caller should set
 158    variables `src' and `src_end' to appropriate areas in advance.  */
 159
 160 #define ONE_MORE_BYTE(c1)       \
 161   do {                          \
 162     if (src < src_end)          \
 163       c1 = *src++;              \
 164     else                        \
 165       goto label_end_of_loop;   \
 166   } while (0)
 167
 168 #define TWO_MORE_BYTES(c1, c2)  \
 169   do {                          \
 170     if (src + 1 < src_end)      \
 171       c1 = *src++, c2 = *src++; \
 172     else                        \
 173       goto label_end_of_loop;   \
 174   } while (0)
 175
 176 #define THREE_MORE_BYTES(c1, c2, c3)            \
 177   do {                                          \
 178     if (src + 2 < src_end)                      \
 179       c1 = *src++, c2 = *src++, c3 = *src++;    \
 180     else                                        \
 181       goto label_end_of_loop;                   \
 182   } while (0)
 183
 184 /* The following three macros DECODE_CHARACTER_ASCII,
 185    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 186    the multi-byte form of a character of each class at the place
 187    pointed by `dst'.  The caller should set the variable `dst' to
 188    point to an appropriate area and the variable `coding' to point to
 189    the coding-system of the currently decoding text in advance.  */
 190
 191 /* Decode one ASCII character C.  */
 192
 193 #define DECODE_CHARACTER_ASCII(c)                               \
 194   do {                                                          \
 195     if (COMPOSING_P (coding->composing))                        \
 196       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 197     else                                                        \
 198       *dst++ = (c);                                             \
 199   } while (0)
 200
 201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 202    position-code is C.  */
 203
 204 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 205   do {                                                                  \
 206     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 207     if (COMPOSING_P (coding->composing))                                \
 208       *dst++ = leading_code + 0x20;                                     \
 209     else                                                                \
 210       *dst++ = leading_code;                                            \
 211     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 212       *dst++ = leading_code;                                            \
 213     *dst++ = (c) | 0x80;                                                \
 214   } while (0)
 215
 216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 217    position-codes are C1 and C2.  */
 218
 219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 220   do {                                                  \
 221     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 222     *dst++ = (c2) | 0x80;                               \
 223   } while (0)
 224
 225 \f
 226 /*** 1. Preamble ***/
 227
 228 #include <stdio.h>
 229
 230 #ifdef emacs
 231
 232 #include <config.h>
 233 #include "lisp.h"
 234 #include "buffer.h"
 235 #include "charset.h"
 236 #include "ccl.h"
 237 #include "coding.h"
 238 #include "window.h"
 239
 240 #else  /* not emacs */
 241
 242 #include "mulelib.h"
 243
 244 #endif /* not emacs */
 245
 246 Lisp_Object Qcoding_system, Qeol_type;
 247 Lisp_Object Qbuffer_file_coding_system;
 248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 249
 250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 252 Lisp_Object Qstart_process, Qopen_network_stream;
 253 Lisp_Object Qtarget_idx;
 254
 255 /* Mnemonic character of each format of end-of-line.  */
 256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 257 /* Mnemonic character to indicate format of end-of-line is not yet
 258    decided.  */
 259 int eol_mnemonic_undecided;
 260
 261 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 262    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 263 int system_eol_type;
 264
 265 #ifdef emacs
 266
 267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 268
 269 /* Coding system emacs-mule is for converting only end-of-line format.  */
 270 Lisp_Object Qemacs_mule;
 271
 272 /* Coding-systems are handed between Emacs Lisp programs and C internal
 273    routines by the following three variables.  */
 274 /* Coding-system for reading files and receiving data from process.  */
 275 Lisp_Object Vcoding_system_for_read;
 276 /* Coding-system for writing files and sending data to process.  */
 277 Lisp_Object Vcoding_system_for_write;
 278 /* Coding-system actually used in the latest I/O.  */
 279 Lisp_Object Vlast_coding_system_used;
 280
 281 /* A vector of length 256 which contains information about special
 282    Latin codes (espepcially for dealing with Microsoft code).  */
 283 Lisp_Object Vlatin_extra_code_table;
 284
 285 /* Flag to inhibit code conversion of end-of-line format.  */
 286 int inhibit_eol_conversion;
 287
 288 /* Coding system to be used to encode text for terminal display.  */
 289 struct coding_system terminal_coding;
 290
 291 /* Coding system to be used to encode text for terminal display when
 292    terminal coding system is nil.  */
 293 struct coding_system safe_terminal_coding;
 294
 295 /* Coding system of what is sent from terminal keyboard.  */
 296 struct coding_system keyboard_coding;
 297
 298 Lisp_Object Vfile_coding_system_alist;
 299 Lisp_Object Vprocess_coding_system_alist;
 300 Lisp_Object Vnetwork_coding_system_alist;
 301
 302 #endif /* emacs */
 303
 304 Lisp_Object Qcoding_category_index;
 305
 306 /* List of symbols `coding-category-xxx' ordered by priority.  */
 307 Lisp_Object Vcoding_category_list;
 308
 309 /* Table of coding-systems currently assigned to each coding-category.  */
 310 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 311
 312 /* Table of names of symbol for each coding-category.  */
 313 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 314   "coding-category-emacs-mule",
 315   "coding-category-sjis",
 316   "coding-category-iso-7",
 317   "coding-category-iso-8-1",
 318   "coding-category-iso-8-2",
 319   "coding-category-iso-7-else",
 320   "coding-category-iso-8-else",
 321   "coding-category-big5",
 322   "coding-category-binary"
 323 };
 324
 325 /* Flag to tell if we look up unification table on character code
 326    conversion.  */
 327 Lisp_Object Venable_character_unification;
 328 /* Standard unification table to look up on decoding (reading).  */
 329 Lisp_Object Vstandard_character_unification_table_for_decode;
 330 /* Standard unification table to look up on encoding (writing).  */
 331 Lisp_Object Vstandard_character_unification_table_for_encode;
 332
 333 Lisp_Object Qcharacter_unification_table;
 334 Lisp_Object Qcharacter_unification_table_for_decode;
 335 Lisp_Object Qcharacter_unification_table_for_encode;
 336
 337 /* Alist of charsets vs revision number.  */
 338 Lisp_Object Vcharset_revision_alist;
 339
 340 /* Default coding systems used for process I/O.  */
 341 Lisp_Object Vdefault_process_coding_system;
 342
 343 \f
 344 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 345
 346 /* Emacs' internal format for encoding multiple character sets is a
 347    kind of multi-byte encoding, i.e. characters are encoded by
 348    variable-length sequences of one-byte codes.  ASCII characters
 349    and control characters (e.g. `tab', `newline') are represented by
 350    one-byte sequences which are their ASCII codes, in the range 0x00
 351    through 0x7F.  The other characters are represented by a sequence
 352    of `base leading-code', optional `extended leading-code', and one
 353    or two `position-code's.  The length of the sequence is determined
 354    by the base leading-code.  Leading-code takes the range 0x80
 355    through 0x9F, whereas extended leading-code and position-code take
 356    the range 0xA0 through 0xFF.  See `charset.h' for more details
 357    about leading-code and position-code.
 358
 359    There's one exception to this rule.  Special leading-code
 360    `leading-code-composition' denotes that the following several
 361    characters should be composed into one character.  Leading-codes of
 362    components (except for ASCII) are added 0x20.  An ASCII character
 363    component is represented by a 2-byte sequence of `0xA0' and
 364    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 365    details of composite character.  Hence, we can summarize the code
 366    range as follows:
 367
 368    --- CODE RANGE of Emacs' internal format ---
 369    (character set)      (range)
 370    ASCII                0x00 .. 0x7F
 371    ELSE (1st byte)      0x80 .. 0x9F
 372         (rest bytes)    0xA0 .. 0xFF
 373    ---------------------------------------------
 374
 375   */
 376
 377 enum emacs_code_class_type emacs_code_class[256];
 378
 379 /* Go to the next statement only if *SRC is accessible and the code is
 380    greater than 0xA0.  */
 381 #define CHECK_CODE_RANGE_A0_FF  \
 382   do {                          \
 383     if (src >= src_end)         \
 384       goto label_end_of_switch; \
 385     else if (*src++ < 0xA0)     \
 386       return 0;                 \
 387   } while (0)
 388
 389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 390    Check if a text is encoded in Emacs' internal format.  If it is,
 391    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 392
 393 int
 394 detect_coding_emacs_mule (src, src_end)
 395      unsigned char *src, *src_end;
 396 {
 397   unsigned char c;
 398   int composing = 0;
 399
 400   while (src < src_end)
 401     {
 402       c = *src++;
 403
 404       if (composing)
 405         {
 406           if (c < 0xA0)
 407             composing = 0;
 408           else
 409             c -= 0x20;
 410         }
 411
 412       switch (emacs_code_class[c])
 413         {
 414         case EMACS_ascii_code:
 415         case EMACS_linefeed_code:
 416           break;
 417
 418         case EMACS_control_code:
 419           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 420             return 0;
 421           break;
 422
 423         case EMACS_invalid_code:
 424           return 0;
 425
 426         case EMACS_leading_code_composition: /* c == 0x80 */
 427           if (composing)
 428             CHECK_CODE_RANGE_A0_FF;
 429           else
 430             composing = 1;
 431           break;
 432
 433         case EMACS_leading_code_4:
 434           CHECK_CODE_RANGE_A0_FF;
 435           /* fall down to check it two more times ...  */
 436
 437         case EMACS_leading_code_3:
 438           CHECK_CODE_RANGE_A0_FF;
 439           /* fall down to check it one more time ...  */
 440
 441         case EMACS_leading_code_2:
 442           CHECK_CODE_RANGE_A0_FF;
 443           break;
 444
 445         default:
 446         label_end_of_switch:
 447           break;
 448         }
 449     }
 450   return CODING_CATEGORY_MASK_EMACS_MULE;
 451 }
 452
 453 \f
 454 /*** 3. ISO2022 handlers ***/
 455
 456 /* The following note describes the coding system ISO2022 briefly.
 457    Since the intention of this note is to help in understanding of
 458    the programs in this file, some parts are NOT ACCURATE or OVERLY
 459    SIMPLIFIED.  For the thorough understanding, please refer to the
 460    original document of ISO2022.
 461
 462    ISO2022 provides many mechanisms to encode several character sets
 463    in 7-bit and 8-bit environment.  If one chooses 7-bite environment,
 464    all text is encoded by codes of less than 128.  This may make the
 465    encoded text a little bit longer, but the text gets more stability
 466    to pass through several gateways (some of them strip off the MSB).
 467
 468    There are two kinds of character set: control character set and
 469    graphic character set.  The former contains control characters such
 470    as `newline' and `escape' to provide control functions (control
 471    functions are provided also by escape sequences).  The latter
 472    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 473    two control character sets and many graphic character sets.
 474
 475    Graphic character sets are classified into one of the following
 476    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 477    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 478    bytes (DIMENSION) and the number of characters in one dimension
 479    (CHARS) of the set.  In addition, each character set is assigned an
 480    identification tag (called "final character" and denoted as <F>
 481    here after) which is unique in each class.  <F> of each character
 482    set is decided by ECMA(*) when it is registered in ISO.  Code range
 483    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 484
 485    Note (*): ECMA = European Computer Manufacturers Association
 486
 487    Here are examples of graphic character set [NAME(<F>)]:
 488         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 489         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 490         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 491         o DIMENSION2_CHARS96 -- none for the moment
 492
 493    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 494         C0 [0x00..0x1F] -- control character plane 0
 495         GL [0x20..0x7F] -- graphic character plane 0
 496         C1 [0x80..0x9F] -- control character plane 1
 497         GR [0xA0..0xFF] -- graphic character plane 1
 498
 499    A control character set is directly designated and invoked to C0 or
 500    C1 by an escape sequence.  The most common case is that ISO646's
 501    control character set is designated/invoked to C0 and ISO6429's
 502    control character set is designated/invoked to C1, and usually
 503    these designations/invocations are omitted in a coded text.  With
 504    7-bit environment, only C0 can be used, and a control character for
 505    C1 is encoded by an appropriate escape sequence to fit in the
 506    environment.  All control characters for C1 are defined the
 507    corresponding escape sequences.
 508
 509    A graphic character set is at first designated to one of four
 510    graphic registers (G0 through G3), then these graphic registers are
 511    invoked to GL or GR.  These designations and invocations can be
 512    done independently.  The most common case is that G0 is invoked to
 513    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 514    these invocations and designations are omitted in a coded text.
 515    With 7-bit environment, only GL can be used.
 516
 517    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 518    and 0x7F of GL area work as control characters SPACE and DEL
 519    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 520
 521    There are two ways of invocation: locking-shift and single-shift.
 522    With locking-shift, the invocation lasts until the next different
 523    invocation, whereas with single-shift, the invocation works only
 524    for the following character and doesn't affect locking-shift.
 525    Invocations are done by the following control characters or escape
 526    sequences.
 527
 528    ----------------------------------------------------------------------
 529    function             control char    escape sequence description
 530    ----------------------------------------------------------------------
 531    SI  (shift-in)               0x0F    none            invoke G0 to GL
 532    SO  (shift-out)              0x0E    none            invoke G1 to GL
 533    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 534    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 535    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 536    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 537    ----------------------------------------------------------------------
 538    The first four are for locking-shift.  Control characters for these
 539    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 540
 541    Designations are done by the following escape sequences.
 542    ----------------------------------------------------------------------
 543    escape sequence      description
 544    ----------------------------------------------------------------------
 545    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 546    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 547    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 548    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 549    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 550    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 551    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 552    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 553    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 554    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 555    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 556    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 557    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 558    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 559    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 560    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 561    ----------------------------------------------------------------------
 562
 563    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 564    of dimension 1, chars 94, and final character <F>, and etc.
 565
 566    Note (*): Although these designations are not allowed in ISO2022,
 567    Emacs accepts them on decoding, and produces them on encoding
 568    CHARS96 character set in a coding system which is characterized as
 569    7-bit environment, non-locking-shift, and non-single-shift.
 570
 571    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 572    '(' can be omitted.  We call this as "short-form" here after.
 573
 574    Now you may notice that there are a lot of ways for encoding the
 575    same multilingual text in ISO2022.  Actually, there exists many
 576    coding systems such as Compound Text (used in X's inter client
 577    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 578    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 579    localized platforms), and all of these are variants of ISO2022.
 580
 581    In addition to the above, Emacs handles two more kinds of escape
 582    sequences: ISO6429's direction specification and Emacs' private
 583    sequence for specifying character composition.
 584
 585    ISO6429's direction specification takes the following format:
 586         o CSI ']'      -- end of the current direction
 587         o CSI '0' ']'  -- end of the current direction
 588         o CSI '1' ']'  -- start of left-to-right text
 589         o CSI '2' ']'  -- start of right-to-left text
 590    The control character CSI (0x9B: control sequence introducer) is
 591    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 592
 593    Character composition specification takes the following format:
 594         o ESC '0' -- start character composition
 595         o ESC '1' -- end character composition
 596    Since these are not standard escape sequences of any ISO, the use
 597    of them for these meaning is restricted to Emacs only.  */
 598
 599 enum iso_code_class_type iso_code_class[256];
 600
 601 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 602    Check if a text is encoded in ISO2022.  If it is, returns an
 603    integer in which appropriate flag bits any of:
 604         CODING_CATEGORY_MASK_ISO_7
 605         CODING_CATEGORY_MASK_ISO_8_1
 606         CODING_CATEGORY_MASK_ISO_8_2
 607         CODING_CATEGORY_MASK_ISO_7_ELSE
 608         CODING_CATEGORY_MASK_ISO_8_ELSE
 609    are set.  If a code which should never appear in ISO2022 is found,
 610    returns 0.  */
 611
 612 int
 613 detect_coding_iso2022 (src, src_end)
 614      unsigned char *src, *src_end;
 615 {
 616   int mask = (CODING_CATEGORY_MASK_ISO_7
 617               | CODING_CATEGORY_MASK_ISO_8_1
 618               | CODING_CATEGORY_MASK_ISO_8_2
 619               | CODING_CATEGORY_MASK_ISO_7_ELSE
 620               | CODING_CATEGORY_MASK_ISO_8_ELSE
 621               );
 622   int g1 = 0;                   /* 1 iff designating to G1.  */
 623   int c, i;
 624   struct coding_system coding_iso_8_1, coding_iso_8_2;
 625
 626   /* Coding systems of these categories may accept latin extra codes.  */
 627   setup_coding_system
 628     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
 629      &coding_iso_8_1);
 630   setup_coding_system
 631     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
 632      &coding_iso_8_2);
 633
 634   while (mask && src < src_end)
 635     {
 636       c = *src++;
 637       switch (c)
 638         {
 639         case ISO_CODE_ESC:
 640           if (src >= src_end)
 641             break;
 642           c = *src++;
 643           if ((c >= '(' && c <= '/'))
 644             {
 645               /* Designation sequence for a charset of dimension 1.  */
 646               if (src >= src_end)
 647                 break;
 648               c = *src++;
 649               if (c < ' ' || c >= 0x80)
 650                 /* Invalid designation sequence.  */
 651                 return 0;
 652             }
 653           else if (c == '$')
 654             {
 655               /* Designation sequence for a charset of dimension 2.  */
 656               if (src >= src_end)
 657                 break;
 658               c = *src++;
 659               if (c >= '@' && c <= 'B')
 660                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 661                 ;
 662               else if (c >= '(' && c <= '/')
 663                 {
 664                   if (src >= src_end)
 665                     break;
 666                   c = *src++;
 667                   if (c < ' ' || c >= 0x80)
 668                     /* Invalid designation sequence.  */
 669                     return 0;
 670                 }
 671               else
 672                 /* Invalid designation sequence.  */
 673                 return 0;
 674             }
 675           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 676             /* Locking shift.  */
 677             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 678                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 679           else if (c == '0' || c == '1' || c == '2')
 680             /* Start/end composition.  */
 681             ;
 682           else
 683             /* Invalid escape sequence.  */
 684             return 0;
 685           break;
 686
 687         case ISO_CODE_SO:
 688           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 689                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 690           break;
 691
 692         case ISO_CODE_CSI:
 693         case ISO_CODE_SS2:
 694         case ISO_CODE_SS3:
 695           {
 696             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 697
 698             if (VECTORP (Vlatin_extra_code_table)
 699                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 700               {
 701                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 702                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 703                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 704                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 705               }
 706             mask &= newmask;
 707           }
 708           break;
 709
 710         default:
 711           if (c < 0x80)
 712             break;
 713           else if (c < 0xA0)
 714             {
 715               if (VECTORP (Vlatin_extra_code_table)
 716                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 717                 {
 718                   int newmask = 0;
 719
 720                   if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 721                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 722                   if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 723                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 724                   mask &= newmask;
 725                 }
 726               else
 727                 return 0;
 728             }
 729           else
 730             {
 731               unsigned char *src_begin = src;
 732
 733               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 734                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 735               while (src < src_end && *src >= 0xA0)
 736                 src++;
 737               if ((src - src_begin - 1) & 1 && src < src_end)
 738                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 739             }
 740           break;
 741         }
 742     }
 743
 744   return mask;
 745 }
 746
 747 /* Decode a character of which charset is CHARSET and the 1st position
 748    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 749    fetched from SRC and set to C2.  If CHARSET is negative, it means
 750    that we are decoding ill formed text, and what we can do is just to
 751    read C1 as is.  */
 752
 753 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 754   do {                                                                  \
 755     int c_alt, charset_alt = (charset);                                 \
 756     if (COMPOSING_HEAD_P (coding->composing))                           \
 757       {                                                                 \
 758         *dst++ = LEADING_CODE_COMPOSITION;                              \
 759         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 760           /* To tell composition rules are embeded.  */                 \
 761           *dst++ = 0xFF;                                                \
 762         coding->composing += 2;                                         \
 763       }                                                                 \
 764     if ((charset) >= 0)                                                 \
 765       {                                                                 \
 766         if (CHARSET_DIMENSION (charset) == 2)                           \
 767           ONE_MORE_BYTE (c2);                                           \
 768         if (!NILP (unification_table)                                   \
 769             && ((c_alt = unify_char (unification_table,                 \
 770                                      -1, (charset), c1, c2)) >= 0))     \
 771           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 772       }                                                                 \
 773     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 774       DECODE_CHARACTER_ASCII (c1);                                      \
 775     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 776       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 777     else                                                                \
 778       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 779     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 780       /* To tell a composition rule follows.  */                        \
 781       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 782   } while (0)
 783
 784 /* Set designation state into CODING.  */
 785 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 786   do {                                                                  \
 787     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 788                                      make_number (chars),               \
 789                                      make_number (final_char));         \
 790     if (charset >= 0)                                                   \
 791       {                                                                 \
 792         if (coding->direction == 1                                      \
 793             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 794           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 795         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 796       }                                                                 \
 797   } while (0)
 798
 799 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 800
 801 int
 802 decode_coding_iso2022 (coding, source, destination,
 803                        src_bytes, dst_bytes, consumed)
 804      struct coding_system *coding;
 805      unsigned char *source, *destination;
 806      int src_bytes, dst_bytes;
 807      int *consumed;
 808 {
 809   unsigned char *src = source;
 810   unsigned char *src_end = source + src_bytes;
 811   unsigned char *dst = destination;
 812   unsigned char *dst_end = destination + dst_bytes;
 813   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 814      from DST_END to assure that overflow checking is necessary only
 815      at the head of loop.  */
 816   unsigned char *adjusted_dst_end = dst_end - 6;
 817   int charset;
 818   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 819   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 820   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 821   Lisp_Object unification_table
 822       = coding->character_unification_table_for_decode;
 823
 824   if (!NILP (Venable_character_unification) && NILP (unification_table))
 825     unification_table = Vstandard_character_unification_table_for_decode;
 826
 827   while (src < src_end && dst < adjusted_dst_end)
 828     {
 829       /* SRC_BASE remembers the start position in source in each loop.
 830          The loop will be exited when there's not enough source text
 831          to analyze long escape sequence or 2-byte code (within macros
 832          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 833          to SRC_BASE before exiting.  */
 834       unsigned char *src_base = src;
 835       int c1 = *src++, c2;
 836
 837       switch (iso_code_class [c1])
 838         {
 839         case ISO_0x20_or_0x7F:
 840           if (!coding->composing
 841               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 842             {
 843               /* This is SPACE or DEL.  */
 844               *dst++ = c1;
 845               break;
 846             }
 847           /* This is a graphic character, we fall down ...  */
 848
 849         case ISO_graphic_plane_0:
 850           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 851             {
 852               /* This is a composition rule.  */
 853               *dst++ = c1 | 0x80;
 854               coding->composing = COMPOSING_WITH_RULE_TAIL;
 855             }
 856           else
 857             DECODE_ISO_CHARACTER (charset0, c1);
 858           break;
 859
 860         case ISO_0xA0_or_0xFF:
 861           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 862             {
 863               /* Invalid code.  */
 864               *dst++ = c1;
 865               break;
 866             }
 867           /* This is a graphic character, we fall down ... */
 868
 869         case ISO_graphic_plane_1:
 870           DECODE_ISO_CHARACTER (charset1, c1);
 871           break;
 872
 873         case ISO_control_code:
 874           /* All ISO2022 control characters in this class have the
 875              same representation in Emacs internal format.  */
 876           *dst++ = c1;
 877           break;
 878
 879         case ISO_carriage_return:
 880           if (coding->eol_type == CODING_EOL_CR)
 881             {
 882               *dst++ = '\n';
 883             }
 884           else if (coding->eol_type == CODING_EOL_CRLF)
 885             {
 886               ONE_MORE_BYTE (c1);
 887               if (c1 == ISO_CODE_LF)
 888                 *dst++ = '\n';
 889               else
 890                 {
 891                   src--;
 892                   *dst++ = c1;
 893                 }
 894             }
 895           else
 896             {
 897               *dst++ = c1;
 898             }
 899           break;
 900
 901         case ISO_shift_out:
 902           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 903             goto label_invalid_escape_sequence;
 904           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 905           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 906           break;
 907
 908         case ISO_shift_in:
 909           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 910           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 911           break;
 912
 913         case ISO_single_shift_2_7:
 914         case ISO_single_shift_2:
 915           /* SS2 is handled as an escape sequence of ESC 'N' */
 916           c1 = 'N';
 917           goto label_escape_sequence;
 918
 919         case ISO_single_shift_3:
 920           /* SS2 is handled as an escape sequence of ESC 'O' */
 921           c1 = 'O';
 922           goto label_escape_sequence;
 923
 924         case ISO_control_sequence_introducer:
 925           /* CSI is handled as an escape sequence of ESC '[' ...  */
 926           c1 = '[';
 927           goto label_escape_sequence;
 928
 929         case ISO_escape:
 930           ONE_MORE_BYTE (c1);
 931         label_escape_sequence:
 932           /* Escape sequences handled by Emacs are invocation,
 933              designation, direction specification, and character
 934              composition specification.  */
 935           switch (c1)
 936             {
 937             case '&':           /* revision of following character set */
 938               ONE_MORE_BYTE (c1);
 939               if (!(c1 >= '@' && c1 <= '~'))
 940                 goto label_invalid_escape_sequence;
 941               ONE_MORE_BYTE (c1);
 942               if (c1 != ISO_CODE_ESC)
 943                 goto label_invalid_escape_sequence;
 944               ONE_MORE_BYTE (c1);
 945               goto label_escape_sequence;
 946
 947             case '$':           /* designation of 2-byte character set */
 948               ONE_MORE_BYTE (c1);
 949               if (c1 >= '@' && c1 <= 'B')
 950                 {       /* designation of JISX0208.1978, GB2312.1980,
 951                                    or JISX0208.1980 */
 952                   DECODE_DESIGNATION (0, 2, 94, c1);
 953                 }
 954               else if (c1 >= 0x28 && c1 <= 0x2B)
 955                 {       /* designation of DIMENSION2_CHARS94 character set */
 956                   ONE_MORE_BYTE (c2);
 957                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 958                 }
 959               else if (c1 >= 0x2C && c1 <= 0x2F)
 960                 {       /* designation of DIMENSION2_CHARS96 character set */
 961                   ONE_MORE_BYTE (c2);
 962                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 963                 }
 964               else
 965                 goto label_invalid_escape_sequence;
 966               break;
 967
 968             case 'n':           /* invocation of locking-shift-2 */
 969               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 970                 goto label_invalid_escape_sequence;
 971               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 972               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 973               break;
 974
 975             case 'o':           /* invocation of locking-shift-3 */
 976               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 977                 goto label_invalid_escape_sequence;
 978               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 979               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 980               break;
 981
 982             case 'N':           /* invocation of single-shift-2 */
 983               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 984                 goto label_invalid_escape_sequence;
 985               ONE_MORE_BYTE (c1);
 986               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 987               DECODE_ISO_CHARACTER (charset, c1);
 988               break;
 989
 990             case 'O':           /* invocation of single-shift-3 */
 991               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 992                 goto label_invalid_escape_sequence;
 993               ONE_MORE_BYTE (c1);
 994               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 995               DECODE_ISO_CHARACTER (charset, c1);
 996               break;
 997
 998             case '0':           /* start composing without embeded rules */
 999               coding->composing = COMPOSING_NO_RULE_HEAD;
1000               break;
1001
1002             case '1':           /* end composing */
1003               coding->composing = COMPOSING_NO;
1004               break;
1005
1006             case '2':           /* start composing with embeded rules */
1007               coding->composing = COMPOSING_WITH_RULE_HEAD;
1008               break;
1009
1010             case '[':           /* specification of direction */
1011               /* For the moment, nested direction is not supported.
1012                  So, the value of `coding->direction' is 0 or 1: 0
1013                  means left-to-right, 1 means right-to-left.  */
1014               ONE_MORE_BYTE (c1);
1015               switch (c1)
1016                 {
1017                 case ']':       /* end of the current direction */
1018                   coding->direction = 0;
1019
1020                 case '0':       /* end of the current direction */
1021                 case '1':       /* start of left-to-right direction */
1022                   ONE_MORE_BYTE (c1);
1023                   if (c1 == ']')
1024                     coding->direction = 0;
1025                   else
1026                     goto label_invalid_escape_sequence;
1027                   break;
1028
1029                 case '2':       /* start of right-to-left direction */
1030                   ONE_MORE_BYTE (c1);
1031                   if (c1 == ']')
1032                     coding->direction= 1;
1033                   else
1034                     goto label_invalid_escape_sequence;
1035                   break;
1036
1037                 default:
1038                   goto label_invalid_escape_sequence;
1039                 }
1040               break;
1041
1042             default:
1043               if (c1 >= 0x28 && c1 <= 0x2B)
1044                 {       /* designation of DIMENSION1_CHARS94 character set */
1045                   ONE_MORE_BYTE (c2);
1046                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1047                 }
1048               else if (c1 >= 0x2C && c1 <= 0x2F)
1049                 {       /* designation of DIMENSION1_CHARS96 character set */
1050                   ONE_MORE_BYTE (c2);
1051                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1052                 }
1053               else
1054                 {
1055                   goto label_invalid_escape_sequence;
1056                 }
1057             }
1058           /* We must update these variables now.  */
1059           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1060           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1061           break;
1062
1063         label_invalid_escape_sequence:
1064           {
1065             int length = src - src_base;
1066
1067             bcopy (src_base, dst, length);
1068             dst += length;
1069           }
1070         }
1071       continue;
1072
1073     label_end_of_loop:
1074       coding->carryover_size = src - src_base;
1075       bcopy (src_base, coding->carryover, coding->carryover_size);
1076       src = src_base;
1077       break;
1078     }
1079
1080   /* If this is the last block of the text to be decoded, we had
1081      better just flush out all remaining codes in the text although
1082      they are not valid characters.  */
1083   if (coding->last_block)
1084     {
1085       bcopy (src, dst, src_end - src);
1086       dst += (src_end - src);
1087       src = src_end;
1088     }
1089   *consumed = src - source;
1090   return dst - destination;
1091 }
1092
1093 /* ISO2022 encoding stuff.  */
1094
1095 /*
1096    It is not enough to say just "ISO2022" on encoding, we have to
1097    specify more details.  In Emacs, each coding-system of ISO2022
1098    variant has the following specifications:
1099         1. Initial designation to G0 thru G3.
1100         2. Allows short-form designation?
1101         3. ASCII should be designated to G0 before control characters?
1102         4. ASCII should be designated to G0 at end of line?
1103         5. 7-bit environment or 8-bit environment?
1104         6. Use locking-shift?
1105         7. Use Single-shift?
1106    And the following two are only for Japanese:
1107         8. Use ASCII in place of JIS0201-1976-Roman?
1108         9. Use JISX0208-1983 in place of JISX0208-1978?
1109    These specifications are encoded in `coding->flags' as flag bits
1110    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1111    details.
1112 */
1113
1114 /* Produce codes (escape sequence) for designating CHARSET to graphic
1115    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1116    the coding system CODING allows, produce designation sequence of
1117    short-form.  */
1118
1119 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1120   do {                                                                  \
1121     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1122     char *intermediate_char_94 = "()*+";                                \
1123     char *intermediate_char_96 = ",-./";                                \
1124     Lisp_Object temp                                                    \
1125       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1126     if (! NILP (temp))                                                  \
1127         {                                                               \
1128         *dst++ = ISO_CODE_ESC;                                          \
1129         *dst++ = '&';                                                   \
1130         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1131       }                                                                 \
1132     *dst++ = ISO_CODE_ESC;                                              \
1133     if (CHARSET_DIMENSION (charset) == 1)                               \
1134       {                                                                 \
1135         if (CHARSET_CHARS (charset) == 94)                              \
1136           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1137         else                                                            \
1138           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1139       }                                                                 \
1140     else                                                                \
1141       {                                                                 \
1142         *dst++ = '$';                                                   \
1143         if (CHARSET_CHARS (charset) == 94)                              \
1144           {                                                             \
1145             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1146                 || reg != 0                                             \
1147                 || final_char < '@' || final_char > 'B')                \
1148               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1149           }                                                             \
1150         else                                                            \
1151           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1152       }                                                                 \
1153     *dst++ = final_char;                                                \
1154     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1155   } while (0)
1156
1157 /* The following two macros produce codes (control character or escape
1158    sequence) for ISO2022 single-shift functions (single-shift-2 and
1159    single-shift-3).  */
1160
1161 #define ENCODE_SINGLE_SHIFT_2                           \
1162   do {                                                  \
1163     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1164       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1165     else                                                \
1166       *dst++ = ISO_CODE_SS2;                            \
1167     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1168   } while (0)
1169
1170 #define ENCODE_SINGLE_SHIFT_3                           \
1171   do {                                                  \
1172     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1173       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1174     else                                                \
1175       *dst++ = ISO_CODE_SS3;                            \
1176     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1177   } while (0)
1178
1179 /* The following four macros produce codes (control character or
1180    escape sequence) for ISO2022 locking-shift functions (shift-in,
1181    shift-out, locking-shift-2, and locking-shift-3).  */
1182
1183 #define ENCODE_SHIFT_IN                         \
1184   do {                                          \
1185     *dst++ = ISO_CODE_SI;                       \
1186     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1187   } while (0)
1188
1189 #define ENCODE_SHIFT_OUT                        \
1190   do {                                          \
1191     *dst++ = ISO_CODE_SO;                       \
1192     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1193   } while (0)
1194
1195 #define ENCODE_LOCKING_SHIFT_2                  \
1196   do {                                          \
1197     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1198     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1199   } while (0)
1200
1201 #define ENCODE_LOCKING_SHIFT_3                  \
1202   do {                                          \
1203     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1204     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1205   } while (0)
1206
1207 /* Produce codes for a DIMENSION1 character whose character set is
1208    CHARSET and whose position-code is C1.  Designation and invocation
1209    sequences are also produced in advance if necessary.  */
1210
1211
1212 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1213   do {                                                                  \
1214     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1215       {                                                                 \
1216         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1217           *dst++ = c1 & 0x7F;                                           \
1218         else                                                            \
1219           *dst++ = c1 | 0x80;                                           \
1220         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1221         break;                                                          \
1222       }                                                                 \
1223     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1224       {                                                                 \
1225         *dst++ = c1 & 0x7F;                                             \
1226         break;                                                          \
1227       }                                                                 \
1228     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1229       {                                                                 \
1230         *dst++ = c1 | 0x80;                                             \
1231         break;                                                          \
1232       }                                                                 \
1233     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1234              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1235       {                                                                 \
1236         /* We should not encode this character, instead produce one or  \
1237            two `?'s.  */                                                \
1238         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1239         if (CHARSET_WIDTH (charset) == 2)                               \
1240           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1241         break;                                                          \
1242       }                                                                 \
1243     else                                                                \
1244       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1245          must invoke it, or, at first, designate it to some graphic     \
1246          register.  Then repeat the loop to actually produce the        \
1247          character.  */                                                 \
1248       dst = encode_invocation_designation (charset, coding, dst);       \
1249   } while (1)
1250
1251 /* Produce codes for a DIMENSION2 character whose character set is
1252    CHARSET and whose position-codes are C1 and C2.  Designation and
1253    invocation codes are also produced in advance if necessary.  */
1254
1255 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1256   do {                                                                  \
1257     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1258       {                                                                 \
1259         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1260           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1261         else                                                            \
1262           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1263         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1264         break;                                                          \
1265       }                                                                 \
1266     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1267       {                                                                 \
1268         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1269         break;                                                          \
1270       }                                                                 \
1271     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1272       {                                                                 \
1273         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1274         break;                                                          \
1275       }                                                                 \
1276     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1277              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1278       {                                                                 \
1279         /* We should not encode this character, instead produce one or  \
1280            two `?'s.  */                                                \
1281         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1282         if (CHARSET_WIDTH (charset) == 2)                               \
1283           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1284         break;                                                          \
1285       }                                                                 \
1286     else                                                                \
1287       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1288          must invoke it, or, at first, designate it to some graphic     \
1289          register.  Then repeat the loop to actually produce the        \
1290          character.  */                                                 \
1291       dst = encode_invocation_designation (charset, coding, dst);       \
1292   } while (1)
1293
1294 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1295   do {                                                                    \
1296     int c_alt, charset_alt;                                               \
1297     if (!NILP (unification_table)                                         \
1298         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1299             >= 0))                                                        \
1300       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1301     else                                                                  \
1302       charset_alt = charset;                                              \
1303     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1304       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1305     else                                                                  \
1306       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1307   } while (0)
1308
1309 /* Produce designation and invocation codes at a place pointed by DST
1310    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1311    Return new DST.  */
1312
1313 unsigned char *
1314 encode_invocation_designation (charset, coding, dst)
1315      int charset;
1316      struct coding_system *coding;
1317      unsigned char *dst;
1318 {
1319   int reg;                      /* graphic register number */
1320
1321   /* At first, check designations.  */
1322   for (reg = 0; reg < 4; reg++)
1323     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1324       break;
1325
1326   if (reg >= 4)
1327     {
1328       /* CHARSET is not yet designated to any graphic registers.  */
1329       /* At first check the requested designation.  */
1330       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1331       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1332         /* Since CHARSET requests no special designation, designate it
1333            to graphic register 0.  */
1334         reg = 0;
1335
1336       ENCODE_DESIGNATION (charset, reg, coding);
1337     }
1338
1339   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1340       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1341     {
1342       /* Since the graphic register REG is not invoked to any graphic
1343          planes, invoke it to graphic plane 0.  */
1344       switch (reg)
1345         {
1346         case 0:                 /* graphic register 0 */
1347           ENCODE_SHIFT_IN;
1348           break;
1349
1350         case 1:                 /* graphic register 1 */
1351           ENCODE_SHIFT_OUT;
1352           break;
1353
1354         case 2:                 /* graphic register 2 */
1355           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1356             ENCODE_SINGLE_SHIFT_2;
1357           else
1358             ENCODE_LOCKING_SHIFT_2;
1359           break;
1360
1361         case 3:                 /* graphic register 3 */
1362           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1363             ENCODE_SINGLE_SHIFT_3;
1364           else
1365             ENCODE_LOCKING_SHIFT_3;
1366           break;
1367         }
1368     }
1369   return dst;
1370 }
1371
1372 /* The following two macros produce codes for indicating composition.  */
1373 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1374 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1375 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1376
1377 /* The following three macros produce codes for indicating direction
1378    of text.  */
1379 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1380   do {                                                  \
1381     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1382       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1383     else                                                \
1384       *dst++ = ISO_CODE_CSI;                            \
1385   } while (0)
1386
1387 #define ENCODE_DIRECTION_R2L    \
1388   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1389
1390 #define ENCODE_DIRECTION_L2R    \
1391   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1392
1393 /* Produce codes for designation and invocation to reset the graphic
1394    planes and registers to initial state.  */
1395 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1396   do {                                                                      \
1397     int reg;                                                                \
1398     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1399       ENCODE_SHIFT_IN;                                                      \
1400     for (reg = 0; reg < 4; reg++)                                           \
1401       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1402           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1403               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1404         ENCODE_DESIGNATION                                                  \
1405           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1406   } while (0)
1407
1408 /* Produce designation sequences of charsets in the line started from
1409    *SRC to a place pointed by DSTP.
1410
1411    If the current block ends before any end-of-line, we may fail to
1412    find all the necessary *designations.  */
1413 encode_designation_at_bol (coding, table, src, src_end, dstp)
1414      struct coding_system *coding;
1415      Lisp_Object table;
1416      unsigned char *src, *src_end, **dstp;
1417 {
1418   int charset, c, found = 0, reg;
1419   /* Table of charsets to be designated to each graphic register.  */
1420   int r[4];
1421   unsigned char *dst = *dstp;
1422
1423   for (reg = 0; reg < 4; reg++)
1424     r[reg] = -1;
1425
1426   while (src < src_end && *src != '\n' && found < 4)
1427     {
1428       int bytes = BYTES_BY_CHAR_HEAD (*src);
1429
1430       if (NILP (table))
1431         charset = CHARSET_AT (src);
1432       else
1433         {
1434           int c_alt, c1, c2;
1435
1436           SPLIT_STRING(src, bytes, charset, c1, c2);
1437           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1438             charset = CHAR_CHARSET (c_alt);
1439         }
1440
1441       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1442       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1443         {
1444           found++;
1445           r[reg] = charset;
1446         }
1447
1448       src += bytes;
1449     }
1450
1451   if (found)
1452     {
1453       for (reg = 0; reg < 4; reg++)
1454         if (r[reg] >= 0
1455             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1456           ENCODE_DESIGNATION (r[reg], reg, coding);
1457       *dstp = dst;
1458     }
1459 }
1460
1461 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1462
1463 int
1464 encode_coding_iso2022 (coding, source, destination,
1465                        src_bytes, dst_bytes, consumed)
1466      struct coding_system *coding;
1467      unsigned char *source, *destination;
1468      int src_bytes, dst_bytes;
1469      int *consumed;
1470 {
1471   unsigned char *src = source;
1472   unsigned char *src_end = source + src_bytes;
1473   unsigned char *dst = destination;
1474   unsigned char *dst_end = destination + dst_bytes;
1475   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1476      from DST_END to assure overflow checking is necessary only at the
1477      head of loop.  */
1478   unsigned char *adjusted_dst_end = dst_end - 19;
1479   Lisp_Object unification_table
1480       = coding->character_unification_table_for_encode;
1481
1482   if (!NILP (Venable_character_unification) && NILP (unification_table))
1483     unification_table = Vstandard_character_unification_table_for_encode;
1484
1485   while (src < src_end && dst < adjusted_dst_end)
1486     {
1487       /* SRC_BASE remembers the start position in source in each loop.
1488          The loop will be exited when there's not enough source text
1489          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1490          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1491          reset to SRC_BASE before exiting.  */
1492       unsigned char *src_base = src;
1493       int charset, c1, c2, c3, c4;
1494
1495       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1496           && CODING_SPEC_ISO_BOL (coding))
1497         {
1498           /* We have to produce designation sequences if any now.  */
1499           encode_designation_at_bol (coding, unification_table,
1500                                      src, src_end, &dst);
1501           CODING_SPEC_ISO_BOL (coding) = 0;
1502         }
1503
1504       c1 = *src++;
1505       /* If we are seeing a component of a composite character, we are
1506          seeing a leading-code specially encoded for composition, or a
1507          composition rule if composing with rule.  We must set C1
1508          to a normal leading-code or an ASCII code.  If we are not at
1509          a composed character, we must reset the composition state.  */
1510       if (COMPOSING_P (coding->composing))
1511         {
1512           if (c1 < 0xA0)
1513             {
1514               /* We are not in a composite character any longer.  */
1515               coding->composing = COMPOSING_NO;
1516               ENCODE_COMPOSITION_END;
1517             }
1518           else
1519             {
1520               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1521                 {
1522                   *dst++ = c1 & 0x7F;
1523                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1524                   continue;
1525                 }
1526               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1527                 coding->composing = COMPOSING_WITH_RULE_RULE;
1528               if (c1 == 0xA0)
1529                 {
1530                   /* This is an ASCII component.  */
1531                   ONE_MORE_BYTE (c1);
1532                   c1 &= 0x7F;
1533                 }
1534               else
1535                 /* This is a leading-code of non ASCII component.  */
1536                 c1 -= 0x20;
1537             }
1538         }
1539
1540       /* Now encode one character.  C1 is a control character, an
1541          ASCII character, or a leading-code of multi-byte character.  */
1542       switch (emacs_code_class[c1])
1543         {
1544         case EMACS_ascii_code:
1545           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1546           break;
1547
1548         case EMACS_control_code:
1549           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1550             ENCODE_RESET_PLANE_AND_REGISTER;
1551           *dst++ = c1;
1552           break;
1553
1554         case EMACS_carriage_return_code:
1555           if (!coding->selective)
1556             {
1557               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1558                 ENCODE_RESET_PLANE_AND_REGISTER;
1559               *dst++ = c1;
1560               break;
1561             }
1562           /* fall down to treat '\r' as '\n' ...  */
1563
1564         case EMACS_linefeed_code:
1565           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1566             ENCODE_RESET_PLANE_AND_REGISTER;
1567           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1568             bcopy (coding->spec.iso2022.initial_designation,
1569                    coding->spec.iso2022.current_designation,
1570                    sizeof coding->spec.iso2022.initial_designation);
1571           if (coding->eol_type == CODING_EOL_LF
1572               || coding->eol_type == CODING_EOL_UNDECIDED)
1573             *dst++ = ISO_CODE_LF;
1574           else if (coding->eol_type == CODING_EOL_CRLF)
1575             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1576           else
1577             *dst++ = ISO_CODE_CR;
1578           CODING_SPEC_ISO_BOL (coding) = 1;
1579           break;
1580
1581         case EMACS_leading_code_2:
1582           ONE_MORE_BYTE (c2);
1583           if (c2 < 0xA0)
1584             {
1585               /* invalid sequence */
1586               *dst++ = c1;
1587               *dst++ = c2;
1588             }
1589           else
1590             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1591           break;
1592
1593         case EMACS_leading_code_3:
1594           TWO_MORE_BYTES (c2, c3);
1595           if (c2 < 0xA0 || c3 < 0xA0)
1596             {
1597               /* invalid sequence */
1598               *dst++ = c1;
1599               *dst++ = c2;
1600               *dst++ = c3;
1601             }
1602           else if (c1 < LEADING_CODE_PRIVATE_11)
1603             ENCODE_ISO_CHARACTER (c1, c2, c3);
1604           else
1605             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1606           break;
1607
1608         case EMACS_leading_code_4:
1609           THREE_MORE_BYTES (c2, c3, c4);
1610           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1611             {
1612               /* invalid sequence */
1613               *dst++ = c1;
1614               *dst++ = c2;
1615               *dst++ = c3;
1616               *dst++ = c4;
1617             }
1618           else
1619             ENCODE_ISO_CHARACTER (c2, c3, c4);
1620           break;
1621
1622         case EMACS_leading_code_composition:
1623           ONE_MORE_BYTE (c2);
1624           if (c2 < 0xA0)
1625             {
1626               /* invalid sequence */
1627               *dst++ = c1;
1628               *dst++ = c2;
1629             }
1630           else if (c2 == 0xFF)
1631             {
1632               coding->composing = COMPOSING_WITH_RULE_HEAD;
1633               ENCODE_COMPOSITION_WITH_RULE_START;
1634             }
1635           else
1636             {
1637               /* Rewind one byte because it is a character code of
1638                  composition elements.  */
1639               src--;
1640               coding->composing = COMPOSING_NO_RULE_HEAD;
1641               ENCODE_COMPOSITION_NO_RULE_START;
1642             }
1643           break;
1644
1645         case EMACS_invalid_code:
1646           *dst++ = c1;
1647           break;
1648         }
1649       continue;
1650     label_end_of_loop:
1651       /* We reach here because the source date ends not at character
1652          boundary.  */
1653       coding->carryover_size = src_end - src_base;
1654       bcopy (src_base, coding->carryover, coding->carryover_size);
1655       src = src_end;
1656       break;
1657     }
1658
1659   /* If this is the last block of the text to be encoded, we must
1660      reset graphic planes and registers to the initial state.  */
1661   if (src >= src_end && coding->last_block)
1662     {
1663       ENCODE_RESET_PLANE_AND_REGISTER;
1664       if (coding->carryover_size > 0
1665           && coding->carryover_size < (dst_end - dst))
1666         {
1667           bcopy (coding->carryover, dst, coding->carryover_size);
1668           dst += coding->carryover_size;
1669           coding->carryover_size = 0;
1670         }
1671     }
1672   *consumed = src - source;
1673   return dst - destination;
1674 }
1675
1676 \f
1677 /*** 4. SJIS and BIG5 handlers ***/
1678
1679 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1680    quite widely.  So, for the moment, Emacs supports them in the bare
1681    C code.  But, in the future, they may be supported only by CCL.  */
1682
1683 /* SJIS is a coding system encoding three character sets: ASCII, right
1684    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1685    as is.  A character of charset katakana-jisx0201 is encoded by
1686    "position-code + 0x80".  A character of charset japanese-jisx0208
1687    is encoded in 2-byte but two position-codes are divided and shifted
1688    so that it fit in the range below.
1689
1690    --- CODE RANGE of SJIS ---
1691    (character set)      (range)
1692    ASCII                0x00 .. 0x7F
1693    KATAKANA-JISX0201    0xA0 .. 0xDF
1694    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1695             (2nd byte)  0x40 .. 0xFF
1696    -------------------------------
1697
1698 */
1699
1700 /* BIG5 is a coding system encoding two character sets: ASCII and
1701    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1702    character set and is encoded in two-byte.
1703
1704    --- CODE RANGE of BIG5 ---
1705    (character set)      (range)
1706    ASCII                0x00 .. 0x7F
1707    Big5 (1st byte)      0xA1 .. 0xFE
1708         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1709    --------------------------
1710
1711    Since the number of characters in Big5 is larger than maximum
1712    characters in Emacs' charset (96x96), it can't be handled as one
1713    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1714    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1715    contains frequently used characters and the latter contains less
1716    frequently used characters.  */
1717
1718 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1719    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1720    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1721    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1722
1723 /* Number of Big5 characters which have the same code in 1st byte.  */
1724 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1725
1726 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1727   do {                                                                  \
1728     unsigned int temp                                                   \
1729       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1730     if (b1 < 0xC9)                                                      \
1731       charset = charset_big5_1;                                         \
1732     else                                                                \
1733       {                                                                 \
1734         charset = charset_big5_2;                                       \
1735         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1736       }                                                                 \
1737     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1738     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1739   } while (0)
1740
1741 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1742   do {                                                                  \
1743     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1744     if (charset == charset_big5_2)                                      \
1745       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1746     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1747     b2 = temp % BIG5_SAME_ROW;                                          \
1748     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1749   } while (0)
1750
1751 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1752   do {                                                                  \
1753     int c_alt, charset_alt = (charset);                                 \
1754     if (!NILP (unification_table)                                       \
1755         && ((c_alt = unify_char (unification_table,                     \
1756                                  -1, (charset), c1, c2)) >= 0))         \
1757           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
1758     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
1759       DECODE_CHARACTER_ASCII (c1);                                      \
1760     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
1761       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1762     else                                                                \
1763       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1764   } while (0)
1765
1766 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1767   do {                                                                    \
1768     int c_alt, charset_alt;                                               \
1769     if (!NILP (unification_table)                                         \
1770         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1771             >= 0))                                                        \
1772       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1773     else                                                                  \
1774       charset_alt = charset;                                              \
1775     if (charset_alt == charset_ascii)                                     \
1776       *dst++ = c1;                                                        \
1777     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1778       {                                                                   \
1779         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1780           *dst++ = c1;                                                    \
1781         else                                                              \
1782           *dst++ = charset_alt, *dst++ = c1;                              \
1783       }                                                                   \
1784     else                                                                  \
1785       {                                                                   \
1786         c1 &= 0x7F, c2 &= 0x7F;                                           \
1787         if (sjis_p && charset_alt == charset_jisx0208)                    \
1788           {                                                               \
1789             unsigned char s1, s2;                                         \
1790                                                                           \
1791             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1792             *dst++ = s1, *dst++ = s2;                                     \
1793           }                                                               \
1794         else if (!sjis_p                                                  \
1795                  && (charset_alt == charset_big5_1                        \
1796                      || charset_alt == charset_big5_2))                   \
1797           {                                                               \
1798             unsigned char b1, b2;                                         \
1799                                                                           \
1800             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1801             *dst++ = b1, *dst++ = b2;                                     \
1802           }                                                               \
1803         else                                                              \
1804           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1805       }                                                                   \
1806   } while (0);
1807
1808 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1809    Check if a text is encoded in SJIS.  If it is, return
1810    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1811
1812 int
1813 detect_coding_sjis (src, src_end)
1814      unsigned char *src, *src_end;
1815 {
1816   unsigned char c;
1817
1818   while (src < src_end)
1819     {
1820       c = *src++;
1821       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1822         return 0;
1823       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1824         {
1825           if (src < src_end && *src++ < 0x40)
1826             return 0;
1827         }
1828     }
1829   return CODING_CATEGORY_MASK_SJIS;
1830 }
1831
1832 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1833    Check if a text is encoded in BIG5.  If it is, return
1834    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1835
1836 int
1837 detect_coding_big5 (src, src_end)
1838      unsigned char *src, *src_end;
1839 {
1840   unsigned char c;
1841
1842   while (src < src_end)
1843     {
1844       c = *src++;
1845       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1846         return 0;
1847       if (c >= 0xA1)
1848         {
1849           if (src >= src_end)
1850             break;
1851           c = *src++;
1852           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1853             return 0;
1854         }
1855     }
1856   return CODING_CATEGORY_MASK_BIG5;
1857 }
1858
1859 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1860    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1861
1862 int
1863 decode_coding_sjis_big5 (coding, source, destination,
1864                          src_bytes, dst_bytes, consumed, sjis_p)
1865      struct coding_system *coding;
1866      unsigned char *source, *destination;
1867      int src_bytes, dst_bytes;
1868      int *consumed;
1869      int sjis_p;
1870 {
1871   unsigned char *src = source;
1872   unsigned char *src_end = source + src_bytes;
1873   unsigned char *dst = destination;
1874   unsigned char *dst_end = destination + dst_bytes;
1875   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1876      from DST_END to assure overflow checking is necessary only at the
1877      head of loop.  */
1878   unsigned char *adjusted_dst_end = dst_end - 3;
1879   Lisp_Object unification_table
1880       = coding->character_unification_table_for_decode;
1881
1882   if (!NILP (Venable_character_unification) && NILP (unification_table))
1883     unification_table = Vstandard_character_unification_table_for_decode;
1884
1885   while (src < src_end && dst < adjusted_dst_end)
1886     {
1887       /* SRC_BASE remembers the start position in source in each loop.
1888          The loop will be exited when there's not enough source text
1889          to analyze two-byte character (within macro ONE_MORE_BYTE).
1890          In that case, SRC is reset to SRC_BASE before exiting.  */
1891       unsigned char *src_base = src;
1892       unsigned char c1 = *src++, c2, c3, c4;
1893
1894       if (c1 == '\r')
1895         {
1896           if (coding->eol_type == CODING_EOL_CRLF)
1897             {
1898               ONE_MORE_BYTE (c2);
1899               if (c2 == '\n')
1900                 *dst++ = c2;
1901               else
1902                 /* To process C2 again, SRC is subtracted by 1.  */
1903                 *dst++ = c1, src--;
1904             }
1905           else
1906             *dst++ = c1;
1907         }
1908       else if (c1 < 0x20)
1909         *dst++ = c1;
1910       else if (c1 < 0x80)
1911         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1912       else if (c1 < 0xA0 || c1 >= 0xE0)
1913         {
1914           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1915           if (sjis_p)
1916             {
1917               ONE_MORE_BYTE (c2);
1918               DECODE_SJIS (c1, c2, c3, c4);
1919               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1920             }
1921           else if (c1 >= 0xE0 && c1 < 0xFF)
1922             {
1923               int charset;
1924
1925               ONE_MORE_BYTE (c2);
1926               DECODE_BIG5 (c1, c2, charset, c3, c4);
1927               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1928             }
1929           else                  /* Invalid code */
1930             *dst++ = c1;
1931         }
1932       else
1933         {
1934           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1935           if (sjis_p)
1936             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1937           else
1938             {
1939               int charset;
1940
1941               ONE_MORE_BYTE (c2);
1942               DECODE_BIG5 (c1, c2, charset, c3, c4);
1943               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1944             }
1945         }
1946       continue;
1947
1948     label_end_of_loop:
1949       coding->carryover_size = src - src_base;
1950       bcopy (src_base, coding->carryover, coding->carryover_size);
1951       src = src_base;
1952       break;
1953     }
1954
1955   *consumed = src - source;
1956   return dst - destination;
1957 }
1958
1959 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1960    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1961    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1962    sure that all these charsets are registered as official charset
1963    (i.e. do not have extended leading-codes).  Characters of other
1964    charsets are produced without any encoding.  If SJIS_P is 1, encode
1965    SJIS text, else encode BIG5 text.  */
1966
1967 int
1968 encode_coding_sjis_big5 (coding, source, destination,
1969                          src_bytes, dst_bytes, consumed, sjis_p)
1970      struct coding_system *coding;
1971      unsigned char *source, *destination;
1972      int src_bytes, dst_bytes;
1973      int *consumed;
1974      int sjis_p;
1975 {
1976   unsigned char *src = source;
1977   unsigned char *src_end = source + src_bytes;
1978   unsigned char *dst = destination;
1979   unsigned char *dst_end = destination + dst_bytes;
1980   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1981      from DST_END to assure overflow checking is necessary only at the
1982      head of loop.  */
1983   unsigned char *adjusted_dst_end = dst_end - 1;
1984   Lisp_Object unification_table
1985       = coding->character_unification_table_for_encode;
1986
1987   if (!NILP (Venable_character_unification) && NILP (unification_table))
1988     unification_table = Vstandard_character_unification_table_for_encode;
1989
1990   while (src < src_end && dst < adjusted_dst_end)
1991     {
1992       /* SRC_BASE remembers the start position in source in each loop.
1993          The loop will be exited when there's not enough source text
1994          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1995          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1996          before exiting.  */
1997       unsigned char *src_base = src;
1998       unsigned char c1 = *src++, c2, c3, c4;
1999
2000       if (coding->composing)
2001         {
2002           if (c1 == 0xA0)
2003             {
2004               ONE_MORE_BYTE (c1);
2005               c1 &= 0x7F;
2006             }
2007           else if (c1 >= 0xA0)
2008             c1 -= 0x20;
2009           else
2010             coding->composing = 0;
2011         }
2012
2013       switch (emacs_code_class[c1])
2014         {
2015         case EMACS_ascii_code:
2016           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2017           break;
2018
2019         case EMACS_control_code:
2020           *dst++ = c1;
2021           break;
2022
2023         case EMACS_carriage_return_code:
2024           if (!coding->selective)
2025             {
2026               *dst++ = c1;
2027               break;
2028             }
2029           /* fall down to treat '\r' as '\n' ...  */
2030
2031         case EMACS_linefeed_code:
2032           if (coding->eol_type == CODING_EOL_LF
2033               || coding->eol_type == CODING_EOL_UNDECIDED)
2034             *dst++ = '\n';
2035           else if (coding->eol_type == CODING_EOL_CRLF)
2036             *dst++ = '\r', *dst++ = '\n';
2037           else
2038             *dst++ = '\r';
2039           break;
2040
2041         case EMACS_leading_code_2:
2042           ONE_MORE_BYTE (c2);
2043           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2044           break;
2045
2046         case EMACS_leading_code_3:
2047           TWO_MORE_BYTES (c2, c3);
2048           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2049           break;
2050
2051         case EMACS_leading_code_4:
2052           THREE_MORE_BYTES (c2, c3, c4);
2053           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2054           break;
2055
2056         case EMACS_leading_code_composition:
2057           coding->composing = 1;
2058           break;
2059
2060         default:                /* i.e. case EMACS_invalid_code: */
2061           *dst++ = c1;
2062         }
2063       continue;
2064
2065     label_end_of_loop:
2066       coding->carryover_size = src_end - src_base;
2067       bcopy (src_base, coding->carryover, coding->carryover_size);
2068       src = src_end;
2069       break;
2070     }
2071
2072   *consumed = src - source;
2073   return dst - destination;
2074 }
2075
2076 \f
2077 /*** 5. End-of-line handlers ***/
2078
2079 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2080    This function is called only when `coding->eol_type' is
2081    CODING_EOL_CRLF or CODING_EOL_CR.  */
2082
2083 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2084      struct coding_system *coding;
2085      unsigned char *source, *destination;
2086      int src_bytes, dst_bytes;
2087      int *consumed;
2088 {
2089   unsigned char *src = source;
2090   unsigned char *src_end = source + src_bytes;
2091   unsigned char *dst = destination;
2092   unsigned char *dst_end = destination + dst_bytes;
2093   int produced;
2094
2095   switch (coding->eol_type)
2096     {
2097     case CODING_EOL_CRLF:
2098       {
2099         /* Since the maximum bytes produced by each loop is 2, we
2100            subtract 1 from DST_END to assure overflow checking is
2101            necessary only at the head of loop.  */
2102         unsigned char *adjusted_dst_end = dst_end - 1;
2103
2104         while (src < src_end && dst < adjusted_dst_end)
2105           {
2106             unsigned char *src_base = src;
2107             unsigned char c = *src++;
2108             if (c == '\r')
2109               {
2110                 ONE_MORE_BYTE (c);
2111                 if (c != '\n')
2112                   *dst++ = '\r';
2113                 *dst++ = c;
2114               }
2115             else
2116               *dst++ = c;
2117             continue;
2118
2119           label_end_of_loop:
2120             coding->carryover_size = src - src_base;
2121             bcopy (src_base, coding->carryover, coding->carryover_size);
2122             src = src_base;
2123             break;
2124           }
2125         *consumed = src - source;
2126         produced = dst - destination;
2127         break;
2128       }
2129
2130     case CODING_EOL_CR:
2131       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2132       bcopy (source, destination, produced);
2133       dst_end = destination + produced;
2134       while (dst < dst_end)
2135         if (*dst++ == '\r') dst[-1] = '\n';
2136       *consumed = produced;
2137       break;
2138
2139     default:                    /* i.e. case: CODING_EOL_LF */
2140       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2141       bcopy (source, destination, produced);
2142       *consumed = produced;
2143       break;
2144     }
2145
2146   return produced;
2147 }
2148
2149 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2150    format of end-of-line according to `coding->eol_type'.  If
2151    `coding->selective' is 1, code '\r' in source text also means
2152    end-of-line.  */
2153
2154 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2155      struct coding_system *coding;
2156      unsigned char *source, *destination;
2157      int src_bytes, dst_bytes;
2158      int *consumed;
2159 {
2160   unsigned char *src = source;
2161   unsigned char *dst = destination;
2162   int produced;
2163
2164   if (src_bytes <= 0)
2165     return 0;
2166
2167   switch (coding->eol_type)
2168     {
2169     case CODING_EOL_LF:
2170     case CODING_EOL_UNDECIDED:
2171       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2172       bcopy (source, destination, produced);
2173       if (coding->selective)
2174         {
2175           int i = produced;
2176           while (i--)
2177             if (*dst++ == '\r') dst[-1] = '\n';
2178         }
2179       *consumed = produced;
2180
2181     case CODING_EOL_CRLF:
2182       {
2183         unsigned char c;
2184         unsigned char *src_end = source + src_bytes;
2185         unsigned char *dst_end = destination + dst_bytes;
2186         /* Since the maximum bytes produced by each loop is 2, we
2187            subtract 1 from DST_END to assure overflow checking is
2188            necessary only at the head of loop.  */
2189         unsigned char *adjusted_dst_end = dst_end - 1;
2190
2191         while (src < src_end && dst < adjusted_dst_end)
2192           {
2193             c = *src++;
2194             if (c == '\n' || (c == '\r' && coding->selective))
2195               *dst++ = '\r', *dst++ = '\n';
2196             else
2197               *dst++ = c;
2198           }
2199         produced = dst - destination;
2200         *consumed = src - source;
2201         break;
2202       }
2203
2204     default:                    /* i.e. case CODING_EOL_CR: */
2205       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2206       bcopy (source, destination, produced);
2207       {
2208         int i = produced;
2209         while (i--)
2210           if (*dst++ == '\n') dst[-1] = '\r';
2211       }
2212       *consumed = produced;
2213     }
2214
2215   return produced;
2216 }
2217
2218 \f
2219 /*** 6. C library functions ***/
2220
2221 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2222    has a property `coding-system'.  The value of this property is a
2223    vector of length 5 (called as coding-vector).  Among elements of
2224    this vector, the first (element[0]) and the fifth (element[4])
2225    carry important information for decoding/encoding.  Before
2226    decoding/encoding, this information should be set in fields of a
2227    structure of type `coding_system'.
2228
2229    A value of property `coding-system' can be a symbol of another
2230    subsidiary coding-system.  In that case, Emacs gets coding-vector
2231    from that symbol.
2232
2233    `element[0]' contains information to be set in `coding->type'.  The
2234    value and its meaning is as follows:
2235
2236    0 -- coding_type_emacs_mule
2237    1 -- coding_type_sjis
2238    2 -- coding_type_iso2022
2239    3 -- coding_type_big5
2240    4 -- coding_type_ccl encoder/decoder written in CCL
2241    nil -- coding_type_no_conversion
2242    t -- coding_type_undecided (automatic conversion on decoding,
2243                                no-conversion on encoding)
2244
2245    `element[4]' contains information to be set in `coding->flags' and
2246    `coding->spec'.  The meaning varies by `coding->type'.
2247
2248    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2249    of length 32 (of which the first 13 sub-elements are used now).
2250    Meanings of these sub-elements are:
2251
2252    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2253         If the value is an integer of valid charset, the charset is
2254         assumed to be designated to graphic register N initially.
2255
2256         If the value is minus, it is a minus value of charset which
2257         reserves graphic register N, which means that the charset is
2258         not designated initially but should be designated to graphic
2259         register N just before encoding a character in that charset.
2260
2261         If the value is nil, graphic register N is never used on
2262         encoding.
2263
2264    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2265         Each value takes t or nil.  See the section ISO2022 of
2266         `coding.h' for more information.
2267
2268    If `coding->type' is `coding_type_big5', element[4] is t to denote
2269    BIG5-ETen or nil to denote BIG5-HKU.
2270
2271    If `coding->type' takes the other value, element[4] is ignored.
2272
2273    Emacs Lisp's coding system also carries information about format of
2274    end-of-line in a value of property `eol-type'.  If the value is
2275    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2276    means CODING_EOL_CR.  If it is not integer, it should be a vector
2277    of subsidiary coding systems of which property `eol-type' has one
2278    of above values.
2279
2280 */
2281
2282 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2283    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2284    is setup so that no conversion is necessary and return -1, else
2285    return 0.  */
2286
2287 int
2288 setup_coding_system (coding_system, coding)
2289      Lisp_Object coding_system;
2290      struct coding_system *coding;
2291 {
2292   Lisp_Object type, eol_type;
2293
2294   /* At first, set several fields to default values.  */
2295   coding->require_flushing = 0;
2296   coding->last_block = 0;
2297   coding->selective = 0;
2298   coding->composing = 0;
2299   coding->direction = 0;
2300   coding->carryover_size = 0;
2301   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2302   coding->character_unification_table_for_decode = Qnil;
2303   coding->character_unification_table_for_encode = Qnil;
2304
2305   Vlast_coding_system_used = coding->symbol = coding_system;
2306   eol_type = Qnil;
2307   /* Get value of property `coding-system' until we get a vector.
2308      While doing that, also get values of properties
2309      `post-read-conversion', `pre-write-conversion',
2310      `character-unification-table-for-decode',
2311      `character-unification-table-for-encode' and `eol-type'.  */
2312   while (!NILP (coding_system) && SYMBOLP (coding_system))
2313     {
2314       if (NILP (coding->post_read_conversion))
2315         coding->post_read_conversion = Fget (coding_system,
2316                                              Qpost_read_conversion);
2317       if (NILP (coding->pre_write_conversion))
2318         coding->pre_write_conversion = Fget (coding_system,
2319                                              Qpre_write_conversion);
2320       if (!inhibit_eol_conversion && NILP (eol_type))
2321         eol_type = Fget (coding_system, Qeol_type);
2322
2323       if (NILP (coding->character_unification_table_for_decode))
2324         coding->character_unification_table_for_decode
2325           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2326
2327       if (NILP (coding->character_unification_table_for_encode))
2328         coding->character_unification_table_for_encode
2329           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2330
2331       coding_system = Fget (coding_system, Qcoding_system);
2332     }
2333
2334   while (!NILP (coding->character_unification_table_for_decode)
2335          && SYMBOLP (coding->character_unification_table_for_decode))
2336         coding->character_unification_table_for_decode
2337           = Fget (coding->character_unification_table_for_decode,
2338                   Qcharacter_unification_table_for_decode);
2339   if (!NILP (coding->character_unification_table_for_decode)
2340       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2341       coding->character_unification_table_for_decode = Qnil;
2342
2343   while (!NILP (coding->character_unification_table_for_encode)
2344          && SYMBOLP (coding->character_unification_table_for_encode))
2345         coding->character_unification_table_for_encode
2346           = Fget (coding->character_unification_table_for_encode,
2347                   Qcharacter_unification_table_for_encode);
2348   if (!NILP (coding->character_unification_table_for_encode)
2349       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2350       coding->character_unification_table_for_encode = Qnil;
2351
2352   if (!VECTORP (coding_system)
2353       || XVECTOR (coding_system)->size != 5)
2354     goto label_invalid_coding_system;
2355
2356   if (VECTORP (eol_type))
2357     coding->eol_type = CODING_EOL_UNDECIDED;
2358   else if (XFASTINT (eol_type) == 1)
2359     coding->eol_type = CODING_EOL_CRLF;
2360   else if (XFASTINT (eol_type) == 2)
2361     coding->eol_type = CODING_EOL_CR;
2362   else
2363     coding->eol_type = CODING_EOL_LF;
2364
2365   type = XVECTOR (coding_system)->contents[0];
2366   switch (XFASTINT (type))
2367     {
2368     case 0:
2369       coding->type = coding_type_emacs_mule;
2370       break;
2371
2372     case 1:
2373       coding->type = coding_type_sjis;
2374       break;
2375
2376     case 2:
2377       coding->type = coding_type_iso2022;
2378       {
2379         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2380         Lisp_Object *flags;
2381         int i, charset, default_reg_bits = 0;
2382
2383         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2384           goto label_invalid_coding_system;
2385
2386         flags = XVECTOR (val)->contents;
2387         coding->flags
2388           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2389              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2390              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2391              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2392              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2393              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2394              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2395              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2396              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2397              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2398              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2399              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2400              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2401              );
2402
2403         /* Invoke graphic register 0 to plane 0.  */
2404         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2405         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2406         CODING_SPEC_ISO_INVOCATION (coding, 1)
2407           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2408         /* Not single shifting at first.  */
2409         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2410         /* Beginning of buffer should also be regarded as bol. */
2411         CODING_SPEC_ISO_BOL (coding) = 1;
2412
2413         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2414            FLAGS[REG] can be one of below:
2415                 integer CHARSET: CHARSET occupies register I,
2416                 t: designate nothing to REG initially, but can be used
2417                   by any charsets,
2418                 list of integer, nil, or t: designate the first
2419                   element (if integer) to REG initially, the remaining
2420                   elements (if integer) is designated to REG on request,
2421                   if an element is t, REG can be used by any charset,
2422                 nil: REG is never used.  */
2423         for (charset = 0; charset <= MAX_CHARSET; charset++)
2424           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2425             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2426         bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2427         for (i = 0; i < 4; i++)
2428           {
2429             if (INTEGERP (flags[i])
2430                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2431                 || (charset = get_charset_id (flags[i])) >= 0)
2432               {
2433                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2434                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2435                 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2436               }
2437             else if (EQ (flags[i], Qt))
2438               {
2439                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2440                 default_reg_bits |= 1 << i;
2441               }
2442             else if (CONSP (flags[i]))
2443               {
2444                 Lisp_Object tail = flags[i];
2445
2446                 if (INTEGERP (XCONS (tail)->car)
2447                     && (charset = XINT (XCONS (tail)->car),
2448                         CHARSET_VALID_P (charset))
2449                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2450                   {
2451                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2452                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2453                     CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2454                   }
2455                 else
2456                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2457                 tail = XCONS (tail)->cdr;
2458                 while (CONSP (tail))
2459                   {
2460                     if (INTEGERP (XCONS (tail)->car)
2461                         && (charset = XINT (XCONS (tail)->car),
2462                             CHARSET_VALID_P (charset))
2463                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2464                       {
2465                         CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2466                           = i;
2467                         CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2468                           = 1;
2469                       }
2470                     else if (EQ (XCONS (tail)->car, Qt))
2471                       default_reg_bits |= 1 << i;
2472                     tail = XCONS (tail)->cdr;
2473                   }
2474               }
2475             else
2476               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2477
2478             CODING_SPEC_ISO_DESIGNATION (coding, i)
2479               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2480           }
2481
2482         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2483           {
2484             /* REG 1 can be used only by locking shift in 7-bit env.  */
2485             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2486               default_reg_bits &= ~2;
2487             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2488               /* Without any shifting, only REG 0 and 1 can be used.  */
2489               default_reg_bits &= 3;
2490           }
2491
2492         for (charset = 0; charset <= MAX_CHARSET; charset++)
2493           if (CHARSET_VALID_P (charset)
2494               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2495                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2496             {
2497               /* We have not yet decided where to designate CHARSET.  */
2498               int reg_bits = default_reg_bits;
2499
2500               if (CHARSET_CHARS (charset) == 96)
2501                 /* A charset of CHARS96 can't be designated to REG 0.  */
2502                 reg_bits &= ~1;
2503
2504               if (reg_bits)
2505                 /* There exist some default graphic register.  */
2506                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2507                   = (reg_bits & 1
2508                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2509               else
2510                 /* We anyway have to designate CHARSET to somewhere.  */
2511                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2512                   = (CHARSET_CHARS (charset) == 94
2513                      ? 0
2514                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2515                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2516                         ? 1
2517                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2518                            ? 2 : 0)));
2519             }
2520       }
2521       coding->require_flushing = 1;
2522       break;
2523
2524     case 3:
2525       coding->type = coding_type_big5;
2526       coding->flags
2527         = (NILP (XVECTOR (coding_system)->contents[4])
2528            ? CODING_FLAG_BIG5_HKU
2529            : CODING_FLAG_BIG5_ETEN);
2530       break;
2531
2532     case 4:
2533       coding->type = coding_type_ccl;
2534       {
2535         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2536         if (CONSP  (val)
2537             && VECTORP (XCONS (val)->car)
2538             && VECTORP (XCONS (val)->cdr))
2539           {
2540             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2541             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2542           }
2543         else
2544           goto label_invalid_coding_system;
2545       }
2546       coding->require_flushing = 1;
2547       break;
2548
2549     default:
2550       if (EQ (type, Qt))
2551         coding->type = coding_type_undecided;
2552       else
2553         coding->type = coding_type_no_conversion;
2554       break;
2555     }
2556   return 0;
2557
2558  label_invalid_coding_system:
2559   coding->type = coding_type_no_conversion;
2560   coding->eol_type = CODING_EOL_LF;
2561   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2562     = Qnil;
2563   return -1;
2564 }
2565
2566 /* Emacs has a mechanism to automatically detect a coding system if it
2567    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2568    it's impossible to distinguish some coding systems accurately
2569    because they use the same range of codes.  So, at first, coding
2570    systems are categorized into 7, those are:
2571
2572    o coding-category-emacs-mule
2573
2574         The category for a coding system which has the same code range
2575         as Emacs' internal format.  Assigned the coding-system (Lisp
2576         symbol) `emacs-mule' by default.
2577
2578    o coding-category-sjis
2579
2580         The category for a coding system which has the same code range
2581         as SJIS.  Assigned the coding-system (Lisp
2582         symbol) `japanese-shift-jis' by default.
2583
2584    o coding-category-iso-7
2585
2586         The category for a coding system which has the same code range
2587         as ISO2022 of 7-bit environment.  This doesn't use any locking
2588         shift and single shift functions.  Assigned the coding-system
2589         (Lisp symbol) `iso-2022-7bit' by default.
2590
2591    o coding-category-iso-8-1
2592
2593         The category for a coding system which has the same code range
2594         as ISO2022 of 8-bit environment and graphic plane 1 used only
2595         for DIMENSION1 charset.  This doesn't use any locking shift
2596         and single shift functions.  Assigned the coding-system (Lisp
2597         symbol) `iso-latin-1' by default.
2598
2599    o coding-category-iso-8-2
2600
2601         The category for a coding system which has the same code range
2602         as ISO2022 of 8-bit environment and graphic plane 1 used only
2603         for DIMENSION2 charset.  This doesn't use any locking shift
2604         and single shift functions.  Assigned the coding-system (Lisp
2605         symbol) `japanese-iso-8bit' by default.
2606
2607    o coding-category-iso-7-else
2608
2609         The category for a coding system which has the same code range
2610         as ISO2022 of 7-bit environemnt but uses locking shift or
2611         single shift functions.  Assigned the coding-system (Lisp
2612         symbol) `iso-2022-7bit-lock' by default.
2613
2614    o coding-category-iso-8-else
2615
2616         The category for a coding system which has the same code range
2617         as ISO2022 of 8-bit environemnt but uses locking shift or
2618         single shift functions.  Assigned the coding-system (Lisp
2619         symbol) `iso-2022-8bit-ss2' by default.
2620
2621    o coding-category-big5
2622
2623         The category for a coding system which has the same code range
2624         as BIG5.  Assigned the coding-system (Lisp symbol)
2625         `cn-big5' by default.
2626
2627    o coding-category-binary
2628
2629         The category for a coding system not categorized in any of the
2630         above.  Assigned the coding-system (Lisp symbol)
2631         `no-conversion' by default.
2632
2633    Each of them is a Lisp symbol and the value is an actual
2634    `coding-system's (this is also a Lisp symbol) assigned by a user.
2635    What Emacs does actually is to detect a category of coding system.
2636    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2637    decide only one possible category, it selects a category of the
2638    highest priority.  Priorities of categories are also specified by a
2639    user in a Lisp variable `coding-category-list'.
2640
2641 */
2642
2643 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2644    If it detects possible coding systems, return an integer in which
2645    appropriate flag bits are set.  Flag bits are defined by macros
2646    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2647
2648 int
2649 detect_coding_mask (src, src_bytes)
2650      unsigned char *src;
2651      int src_bytes;
2652 {
2653   register unsigned char c;
2654   unsigned char *src_end = src + src_bytes;
2655   int mask;
2656
2657   /* At first, skip all ASCII characters and control characters except
2658      for three ISO2022 specific control characters.  */
2659  label_loop_detect_coding:
2660   while (src < src_end)
2661     {
2662       c = *src;
2663       if (c >= 0x80
2664           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2665         break;
2666       src++;
2667     }
2668
2669   if (src >= src_end)
2670     /* We found nothing other than ASCII.  There's nothing to do.  */
2671     return CODING_CATEGORY_MASK_ANY;
2672
2673   /* The text seems to be encoded in some multilingual coding system.
2674      Now, try to find in which coding system the text is encoded.  */
2675   if (c < 0x80)
2676     {
2677       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2678       /* C is an ISO2022 specific control code of C0.  */
2679       mask = detect_coding_iso2022 (src, src_end);
2680       src++;
2681       if (mask == CODING_CATEGORY_MASK_ANY)
2682         /* No valid ISO2022 code follows C.  Try again.  */
2683         goto label_loop_detect_coding;
2684     }
2685   else if (c < 0xA0)
2686     {
2687       /* If C is a special latin extra code,
2688          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2689          or is an ISO2022 control-sequence-introducer (CSI),
2690          we should also consider the possibility of someof ISO2022 codings.  */
2691       if ((VECTORP (Vlatin_extra_code_table)
2692            && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2693           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2694           || (c == ISO_CODE_CSI
2695               && (src < src_end
2696                   && (*src == ']'
2697                       || (src + 1 < src_end
2698                           && src[1] == ']'
2699                           && (*src == '0' || *src == '1' || *src == '2'))))))
2700         mask = (detect_coding_iso2022 (src, src_end)
2701                 | detect_coding_sjis (src, src_end)
2702                 | detect_coding_emacs_mule (src, src_end)
2703                 | CODING_CATEGORY_MASK_BINARY);
2704
2705       else
2706         /* C is the first byte of SJIS character code, or a
2707            leading-code of Emacs.  */
2708         mask = (detect_coding_sjis (src, src_end)
2709                 | detect_coding_emacs_mule (src, src_end)
2710                 | CODING_CATEGORY_MASK_BINARY);
2711     }
2712   else
2713     /* C is a character of ISO2022 in graphic plane right,
2714        or a SJIS's 1-byte character code (i.e. JISX0201),
2715        or the first byte of BIG5's 2-byte code.  */
2716     mask = (detect_coding_iso2022 (src, src_end)
2717             | detect_coding_sjis (src, src_end)
2718             | detect_coding_big5 (src, src_end)
2719             | CODING_CATEGORY_MASK_BINARY);
2720
2721   return mask;
2722 }
2723
2724 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2725    The information of the detected coding system is set in CODING.  */
2726
2727 void
2728 detect_coding (coding, src, src_bytes)
2729      struct coding_system *coding;
2730      unsigned char *src;
2731      int src_bytes;
2732 {
2733   int mask = detect_coding_mask (src, src_bytes);
2734   int idx;
2735
2736   if (mask == CODING_CATEGORY_MASK_ANY)
2737     /* We found nothing other than ASCII.  There's nothing to do.  */
2738     return;
2739
2740   if (!mask)
2741     /* The source text seems to be encoded in unknown coding system.
2742        Emacs regards the category of such a kind of coding system as
2743        `coding-category-binary'.  We assume that a user has assigned
2744        an appropriate coding system for a `coding-category-binary'.  */
2745     idx = CODING_CATEGORY_IDX_BINARY;
2746   else
2747     {
2748       /* We found some plausible coding systems.  Let's use a coding
2749          system of the highest priority.  */
2750       Lisp_Object val = Vcoding_category_list;
2751
2752       if (CONSP (val))
2753         while (!NILP (val))
2754           {
2755             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2756             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2757               break;
2758             val = XCONS (val)->cdr;
2759           }
2760       else
2761         val = Qnil;
2762
2763       if (NILP (val))
2764         {
2765           /* For unknown reason, `Vcoding_category_list' contains none
2766              of found categories.  Let's use any of them.  */
2767           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2768             if (mask & (1 << idx))
2769               break;
2770         }
2771     }
2772   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2773 }
2774
2775 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2776    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2777    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2778
2779 #define MAX_EOL_CHECK_COUNT 3
2780
2781 int
2782 detect_eol_type (src, src_bytes)
2783      unsigned char *src;
2784      int src_bytes;
2785 {
2786   unsigned char *src_end = src + src_bytes;
2787   unsigned char c;
2788   int total = 0;                /* How many end-of-lines are found so far.  */
2789   int eol_type = CODING_EOL_UNDECIDED;
2790   int this_eol_type;
2791
2792   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2793     {
2794       c = *src++;
2795       if (c == '\n' || c == '\r')
2796         {
2797           total++;
2798           if (c == '\n')
2799             this_eol_type = CODING_EOL_LF;
2800           else if (src >= src_end || *src != '\n')
2801             this_eol_type = CODING_EOL_CR;
2802           else
2803             this_eol_type = CODING_EOL_CRLF, src++;
2804
2805           if (eol_type == CODING_EOL_UNDECIDED)
2806             /* This is the first end-of-line.  */
2807             eol_type = this_eol_type;
2808           else if (eol_type != this_eol_type)
2809             /* The found type is different from what found before.
2810                We had better not decode end-of-line.  */
2811             return CODING_EOL_LF;
2812         }
2813     }
2814
2815   return eol_type;
2816 }
2817
2818 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2819    is encoded.  If it detects an appropriate format of end-of-line, it
2820    sets the information in *CODING.  */
2821
2822 void
2823 detect_eol (coding, src, src_bytes)
2824      struct coding_system *coding;
2825      unsigned char *src;
2826      int src_bytes;
2827 {
2828   Lisp_Object val, coding_system;
2829   int eol_type = detect_eol_type (src, src_bytes);
2830
2831   if (eol_type == CODING_EOL_UNDECIDED)
2832     /*  We found no end-of-line in the source text.  */
2833     return;
2834
2835   coding_system = coding->symbol;
2836   while (!NILP (coding_system)
2837          && NILP (val = Fget (coding_system, Qeol_type)))
2838     coding_system = Fget (coding_system, Qcoding_system);
2839   if (VECTORP (val) && XVECTOR (val)->size == 3)
2840     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2841 }
2842
2843 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2844    decoding, it may detect coding system and format of end-of-line if
2845    those are not yet decided.  */
2846
2847 int
2848 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2849      struct coding_system *coding;
2850      unsigned char *source, *destination;
2851      int src_bytes, dst_bytes;
2852      int *consumed;
2853 {
2854   int produced;
2855
2856   if (src_bytes <= 0)
2857     {
2858       *consumed = 0;
2859       return 0;
2860     }
2861
2862   if (coding->type == coding_type_undecided)
2863     detect_coding (coding, source, src_bytes);
2864
2865   if (coding->eol_type == CODING_EOL_UNDECIDED)
2866     detect_eol (coding, source, src_bytes);
2867
2868   coding->carryover_size = 0;
2869   switch (coding->type)
2870     {
2871     case coding_type_no_conversion:
2872     label_no_conversion:
2873       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2874       bcopy (source, destination, produced);
2875       *consumed = produced;
2876       break;
2877
2878     case coding_type_emacs_mule:
2879     case coding_type_undecided:
2880       if (coding->eol_type == CODING_EOL_LF
2881           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2882         goto label_no_conversion;
2883       produced = decode_eol (coding, source, destination,
2884                              src_bytes, dst_bytes, consumed);
2885       break;
2886
2887     case coding_type_sjis:
2888       produced = decode_coding_sjis_big5 (coding, source, destination,
2889                                           src_bytes, dst_bytes, consumed,
2890                                           1);
2891       break;
2892
2893     case coding_type_iso2022:
2894       produced = decode_coding_iso2022 (coding, source, destination,
2895                                         src_bytes, dst_bytes, consumed);
2896       break;
2897
2898     case coding_type_big5:
2899       produced = decode_coding_sjis_big5 (coding, source, destination,
2900                                           src_bytes, dst_bytes, consumed,
2901                                           0);
2902       break;
2903
2904     case coding_type_ccl:
2905       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2906                              src_bytes, dst_bytes, consumed);
2907       break;
2908     }
2909
2910   return produced;
2911 }
2912
2913 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2914
2915 int
2916 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2917      struct coding_system *coding;
2918      unsigned char *source, *destination;
2919      int src_bytes, dst_bytes;
2920      int *consumed;
2921 {
2922   int produced;
2923
2924   switch (coding->type)
2925     {
2926     case coding_type_no_conversion:
2927     label_no_conversion:
2928       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2929       if (produced > 0)
2930         {
2931           bcopy (source, destination, produced);
2932           if (coding->selective)
2933             {
2934               unsigned char *p = destination, *pend = destination + produced;
2935               while (p < pend)
2936                 if (*p++ == '\015') p[-1] = '\n';
2937             }
2938         }
2939       *consumed = produced;
2940       break;
2941
2942     case coding_type_emacs_mule:
2943     case coding_type_undecided:
2944       if (coding->eol_type == CODING_EOL_LF
2945           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2946         goto label_no_conversion;
2947       produced = encode_eol (coding, source, destination,
2948                              src_bytes, dst_bytes, consumed);
2949       break;
2950
2951     case coding_type_sjis:
2952       produced = encode_coding_sjis_big5 (coding, source, destination,
2953                                           src_bytes, dst_bytes, consumed,
2954                                           1);
2955       break;
2956
2957     case coding_type_iso2022:
2958       produced = encode_coding_iso2022 (coding, source, destination,
2959                                         src_bytes, dst_bytes, consumed);
2960       break;
2961
2962     case coding_type_big5:
2963       produced = encode_coding_sjis_big5 (coding, source, destination,
2964                                           src_bytes, dst_bytes, consumed,
2965                                           0);
2966       break;
2967
2968     case coding_type_ccl:
2969       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2970                              src_bytes, dst_bytes, consumed);
2971       break;
2972     }
2973
2974   return produced;
2975 }
2976
2977 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2978
2979 /* Return maximum size (bytes) of a buffer enough for decoding
2980    SRC_BYTES of text encoded in CODING.  */
2981
2982 int
2983 decoding_buffer_size (coding, src_bytes)
2984      struct coding_system *coding;
2985      int src_bytes;
2986 {
2987   int magnification;
2988
2989   if (coding->type == coding_type_iso2022)
2990     magnification = 3;
2991   else if (coding->type == coding_type_ccl)
2992     magnification = coding->spec.ccl.decoder.buf_magnification;
2993   else
2994     magnification = 2;
2995
2996   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2997 }
2998
2999 /* Return maximum size (bytes) of a buffer enough for encoding
3000    SRC_BYTES of text to CODING.  */
3001
3002 int
3003 encoding_buffer_size (coding, src_bytes)
3004      struct coding_system *coding;
3005      int src_bytes;
3006 {
3007   int magnification;
3008
3009   if (coding->type == coding_type_ccl)
3010     magnification = coding->spec.ccl.encoder.buf_magnification;
3011   else
3012     magnification = 3;
3013
3014   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3015 }
3016
3017 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3018 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3019 #endif
3020
3021 char *conversion_buffer;
3022 int conversion_buffer_size;
3023
3024 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3025    or decoding.  Sufficient memory is allocated automatically.  If we
3026    run out of memory, return NULL.  */
3027
3028 char *
3029 get_conversion_buffer (size)
3030      int size;
3031 {
3032   if (size > conversion_buffer_size)
3033     {
3034       char *buf;
3035       int real_size = conversion_buffer_size * 2;
3036
3037       while (real_size < size) real_size *= 2;
3038       buf = (char *) xmalloc (real_size);
3039       xfree (conversion_buffer);
3040       conversion_buffer = buf;
3041       conversion_buffer_size = real_size;
3042     }
3043   return conversion_buffer;
3044 }
3045
3046 \f
3047 #ifdef emacs
3048 /*** 7. Emacs Lisp library functions ***/
3049
3050 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3051        1, 1, 0,
3052   "Return coding-spec of CODING-SYSTEM.\n\
3053 If CODING-SYSTEM is not a valid coding-system, return nil.")
3054   (obj)
3055      Lisp_Object obj;
3056 {
3057   while (SYMBOLP (obj) && !NILP (obj))
3058     obj = Fget (obj, Qcoding_system);
3059   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3060           ? Qnil : obj);
3061 }
3062
3063 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3064   "Return t if OBJECT is nil or a coding-system.\n\
3065 See document of make-coding-system for coding-system object.")
3066   (obj)
3067      Lisp_Object obj;
3068 {
3069   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3070 }
3071
3072 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3073        Sread_non_nil_coding_system, 1, 1, 0,
3074   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3075   (prompt)
3076      Lisp_Object prompt;
3077 {
3078   Lisp_Object val;
3079   do
3080     {
3081       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3082                               Qt, Qnil, Qnil, Qnil);
3083     }
3084   while (XSTRING (val)->size == 0);
3085   return (Fintern (val, Qnil));
3086 }
3087
3088 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3089   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3090   (prompt)
3091      Lisp_Object prompt;
3092 {
3093   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3094                                       Qt, Qnil, Qnil, Qnil);
3095   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3096 }
3097
3098 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3099        1, 1, 0,
3100   "Check validity of CODING-SYSTEM.\n\
3101 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3102 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3103 The value of property should be a vector of length 5.")
3104   (coding_system)
3105      Lisp_Object coding_system;
3106 {
3107   CHECK_SYMBOL (coding_system, 0);
3108   if (!NILP (Fcoding_system_p (coding_system)))
3109     return coding_system;
3110   while (1)
3111     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3112 }
3113
3114 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3115        2, 2, 0,
3116   "Detect coding system of the text in the region between START and END.\n\
3117 Return a list of possible coding systems ordered by priority.\n\
3118 If only ASCII characters are found, it returns `undecided'\n\
3119  or its subsidiary coding system according to a detected end-of-line format.")
3120   (b, e)
3121      Lisp_Object b, e;
3122 {
3123   int coding_mask, eol_type;
3124   Lisp_Object val;
3125   int beg, end;
3126
3127   validate_region (&b, &e);
3128   beg = XINT (b), end = XINT (e);
3129   if (beg < GPT && end >= GPT) move_gap (end);
3130
3131   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3132   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3133
3134   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3135     {
3136       val = intern ("undecided");
3137       if (eol_type != CODING_EOL_UNDECIDED)
3138         {
3139           Lisp_Object val2 = Fget (val, Qeol_type);
3140           if (VECTORP (val2))
3141             val = XVECTOR (val2)->contents[eol_type];
3142         }
3143     }
3144   else
3145     {
3146       Lisp_Object val2;
3147
3148       /* At first, gather possible coding-systems in VAL in a reverse
3149          order.  */
3150       val = Qnil;
3151       for (val2 = Vcoding_category_list;
3152            !NILP (val2);
3153            val2 = XCONS (val2)->cdr)
3154         {
3155           int idx
3156             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3157           if (coding_mask & (1 << idx))
3158             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3159         }
3160
3161       /* Then, change the order of the list, while getting subsidiary
3162          coding-systems.  */
3163       val2 = val;
3164       val = Qnil;
3165       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3166         {
3167           if (eol_type == CODING_EOL_UNDECIDED)
3168             val = Fcons (XCONS (val2)->car, val);
3169           else
3170             {
3171               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3172               if (VECTORP (val3))
3173                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3174               else
3175                 val = Fcons (XCONS (val2)->car, val);
3176             }
3177         }
3178     }
3179
3180   return val;
3181 }
3182
3183 /* Scan text in the region between *BEGP and *ENDP, skip characters
3184    which we never have to encode to (iff ENCODEP is 1) or decode from
3185    coding system CODING at the head and tail, then set BEGP and ENDP
3186    to the addresses of start and end of the text we actually convert.  */
3187
3188 void
3189 shrink_conversion_area (begp, endp, coding, encodep)
3190      unsigned char **begp, **endp;
3191      struct coding_system *coding;
3192      int encodep;
3193 {
3194   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3195
3196   if (coding->eol_type != CODING_EOL_LF
3197       && coding->eol_type != CODING_EOL_UNDECIDED)
3198     /* Since we anyway have to convert end-of-line format, it is not
3199        worth skipping at most 100 bytes or so.  */
3200     return;
3201
3202   if (encodep)                  /* for encoding */
3203     {
3204       switch (coding->type)
3205         {
3206         case coding_type_no_conversion:
3207         case coding_type_emacs_mule:
3208         case coding_type_undecided:
3209           /* We need no conversion.  */
3210           *begp = *endp;
3211           return;
3212         case coding_type_ccl:
3213           /* We can't skip any data.  */
3214           return;
3215         case coding_type_iso2022:
3216           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3217             {
3218               unsigned char *bol = beg_addr;
3219               while (beg_addr < end_addr && *beg_addr < 0x80)
3220                 {
3221                   beg_addr++;
3222                   if (*(beg_addr - 1) == '\n')
3223                     bol = beg_addr;
3224                 }
3225               beg_addr = bol;
3226               goto label_skip_tail;
3227             }
3228           /* fall down ... */
3229         default:
3230           /* We can skip all ASCII characters at the head and tail.  */
3231           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3232         label_skip_tail:
3233           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3234           break;
3235         }
3236     }
3237   else                          /* for decoding */
3238     {
3239       switch (coding->type)
3240         {
3241         case coding_type_no_conversion:
3242           /* We need no conversion.  */
3243           *begp = *endp;
3244           return;
3245         case coding_type_emacs_mule:
3246           if (coding->eol_type == CODING_EOL_LF)
3247             {
3248               /* We need no conversion.  */
3249               *begp = *endp;
3250               return;
3251             }
3252           /* We can skip all but carriage-return.  */
3253           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3254           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3255           break;
3256         case coding_type_sjis:
3257         case coding_type_big5:
3258           /* We can skip all ASCII characters at the head.  */
3259           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3260           /* We can skip all ASCII characters at the tail except for
3261              the second byte of SJIS or BIG5 code.  */
3262           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3263           if (end_addr != *endp)
3264             end_addr++;
3265           break;
3266         case coding_type_ccl:
3267           /* We can't skip any data.  */
3268           return;
3269         default:                /* i.e. case coding_type_iso2022: */
3270           {
3271             unsigned char c;
3272
3273             /* We can skip all ASCII characters except for a few
3274                control codes at the head.  */
3275             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3276                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3277                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3278               beg_addr++;
3279           }
3280           break;
3281         }
3282     }
3283   *begp = beg_addr;
3284   *endp = end_addr;
3285   return;
3286 }
3287
3288 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3289    text between B and E.  B and E are buffer position.  */
3290
3291 Lisp_Object
3292 code_convert_region (b, e, coding, encodep)
3293      Lisp_Object b, e;
3294      struct coding_system *coding;
3295      int encodep;
3296 {
3297   int beg, end, len, consumed, produced;
3298   char *buf;
3299   unsigned char *begp, *endp;
3300   int pos = PT;
3301
3302   validate_region (&b, &e);
3303   beg = XINT (b), end = XINT (e);
3304   if (beg < GPT && end >= GPT)
3305     move_gap (end);
3306
3307   if (encodep && !NILP (coding->pre_write_conversion))
3308     {
3309       /* We must call a pre-conversion function which may put a new
3310          text to be converted in a new buffer.  */
3311       struct buffer *old = current_buffer, *new;
3312
3313       TEMP_SET_PT (beg);
3314       call2 (coding->pre_write_conversion, b, e);
3315       if (old != current_buffer)
3316         {
3317           /* Replace the original text by the text just generated.  */
3318           len = ZV - BEGV;
3319           new = current_buffer;
3320           set_buffer_internal (old);
3321           del_range (beg, end);
3322           insert_from_buffer (new, 1, len, 0);
3323           end = beg + len;
3324         }
3325     }
3326
3327   /* We may be able to shrink the conversion region.  */
3328   begp = POS_ADDR (beg); endp = begp + (end - beg);
3329   shrink_conversion_area (&begp, &endp, coding, encodep);
3330
3331   if (begp == endp)
3332     /* We need no conversion.  */
3333     len = end - beg;
3334   else
3335     {
3336       beg += begp - POS_ADDR (beg);
3337       end =  beg + (endp - begp);
3338
3339       if (encodep)
3340         len = encoding_buffer_size (coding, end - beg);
3341       else
3342         len = decoding_buffer_size (coding, end - beg);
3343       buf = get_conversion_buffer (len);
3344
3345       coding->last_block = 1;
3346       produced = (encodep
3347                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3348                                    &consumed)
3349                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3350                                    &consumed));
3351
3352       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3353
3354       TEMP_SET_PT (beg);
3355       insert (buf, produced);
3356       del_range (PT, PT + end - beg);
3357       if (pos >= end)
3358         pos = PT + (pos - end);
3359       else if (pos > beg)
3360         pos = beg;
3361       TEMP_SET_PT (pos);
3362   }
3363
3364   if (!encodep && !NILP (coding->post_read_conversion))
3365     {
3366       /* We must call a post-conversion function which may alter
3367          the text just converted.  */
3368       Lisp_Object insval;
3369
3370       beg = XINT (b);
3371       TEMP_SET_PT (beg);
3372       insval = call1 (coding->post_read_conversion, make_number (len));
3373       CHECK_NUMBER (insval, 0);
3374       len = XINT (insval);
3375     }
3376
3377   return make_number (len);
3378 }
3379
3380 Lisp_Object
3381 code_convert_string (str, coding, encodep, nocopy)
3382      Lisp_Object str, nocopy;
3383      struct coding_system *coding;
3384      int encodep;
3385 {
3386   int len, consumed, produced;
3387   char *buf;
3388   unsigned char *begp, *endp;
3389   int head_skip, tail_skip;
3390   struct gcpro gcpro1;
3391
3392   if (encodep && !NILP (coding->pre_write_conversion)
3393       || !encodep && !NILP (coding->post_read_conversion))
3394     {
3395       /* Since we have to call Lisp functions which assume target text
3396          is in a buffer, after setting a temporary buffer, call
3397          code_convert_region.  */
3398       int count = specpdl_ptr - specpdl;
3399       int len = XSTRING (str)->size;
3400       Lisp_Object result;
3401       struct buffer *old = current_buffer;
3402
3403       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3404       temp_output_buffer_setup (" *code-converting-work*");
3405       set_buffer_internal (XBUFFER (Vstandard_output));
3406       insert_from_string (str, 0, len, 0);
3407       code_convert_region (make_number (BEGV), make_number (ZV),
3408                            coding, encodep);
3409       result = make_buffer_string (BEGV, ZV, 0);
3410       set_buffer_internal (old);
3411       return unbind_to (count, result);
3412     }
3413
3414   /* We may be able to shrink the conversion region.  */
3415   begp = XSTRING (str)->data;
3416   endp = begp + XSTRING (str)->size;
3417   shrink_conversion_area (&begp, &endp, coding, encodep);
3418
3419   if (begp == endp)
3420     /* We need no conversion.  */
3421     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3422
3423   head_skip = begp - XSTRING (str)->data;
3424   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3425
3426   GCPRO1 (str);
3427
3428   if (encodep)
3429     len = encoding_buffer_size (coding, endp - begp);
3430   else
3431     len = decoding_buffer_size (coding, endp - begp);
3432   buf = get_conversion_buffer (len + head_skip + tail_skip);
3433
3434   bcopy (XSTRING (str)->data, buf, head_skip);
3435   coding->last_block = 1;
3436   produced = (encodep
3437               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3438                                buf + head_skip, endp - begp, len, &consumed)
3439               : decode_coding (coding, XSTRING (str)->data + head_skip,
3440                                buf + head_skip, endp - begp, len, &consumed));
3441   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3442          buf + head_skip + produced,
3443          tail_skip);
3444
3445   UNGCPRO;
3446
3447   return make_string (buf, head_skip + produced + tail_skip);
3448 }
3449
3450 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3451        3, 3, "r\nzCoding system: ",
3452   "Decode current region by specified coding system.\n\
3453 When called from a program, takes three arguments:\n\
3454 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3455 Return length of decoded text.")
3456   (b, e, coding_system)
3457      Lisp_Object b, e, coding_system;
3458 {
3459   struct coding_system coding;
3460
3461   CHECK_NUMBER_COERCE_MARKER (b, 0);
3462   CHECK_NUMBER_COERCE_MARKER (e, 1);
3463   CHECK_SYMBOL (coding_system, 2);
3464
3465   if (NILP (coding_system))
3466     return make_number (XFASTINT (e) - XFASTINT (b));
3467   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3468     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3469
3470   return code_convert_region (b, e, &coding, 0);
3471 }
3472
3473 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3474        3, 3, "r\nzCoding system: ",
3475   "Encode current region by specified coding system.\n\
3476 When called from a program, takes three arguments:\n\
3477 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3478 Return length of encoded text.")
3479   (b, e, coding_system)
3480      Lisp_Object b, e, coding_system;
3481 {
3482   struct coding_system coding;
3483
3484   CHECK_NUMBER_COERCE_MARKER (b, 0);
3485   CHECK_NUMBER_COERCE_MARKER (e, 1);
3486   CHECK_SYMBOL (coding_system, 2);
3487
3488   if (NILP (coding_system))
3489     return make_number (XFASTINT (e) - XFASTINT (b));
3490   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3491     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3492
3493   return code_convert_region (b, e, &coding, 1);
3494 }
3495
3496 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3497        2, 3, 0,
3498   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3499 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3500 of decoding.")
3501   (string, coding_system, nocopy)
3502      Lisp_Object string, coding_system, nocopy;
3503 {
3504   struct coding_system coding;
3505
3506   CHECK_STRING (string, 0);
3507   CHECK_SYMBOL (coding_system, 1);
3508
3509   if (NILP (coding_system))
3510     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3511   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3512     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3513
3514   return code_convert_string (string, &coding, 0, nocopy);
3515 }
3516
3517 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3518        2, 3, 0,
3519   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3520 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3521 of encoding.")
3522   (string, coding_system, nocopy)
3523      Lisp_Object string, coding_system, nocopy;
3524 {
3525   struct coding_system coding;
3526
3527   CHECK_STRING (string, 0);
3528   CHECK_SYMBOL (coding_system, 1);
3529
3530   if (NILP (coding_system))
3531     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3532   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3533     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3534
3535   return code_convert_string (string, &coding, 1, nocopy);
3536 }
3537
3538 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3539   "Decode a JISX0208 character of shift-jis encoding.\n\
3540 CODE is the character code in SJIS.\n\
3541 Return the corresponding character.")
3542   (code)
3543      Lisp_Object code;
3544 {
3545   unsigned char c1, c2, s1, s2;
3546   Lisp_Object val;
3547
3548   CHECK_NUMBER (code, 0);
3549   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3550   DECODE_SJIS (s1, s2, c1, c2);
3551   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3552   return val;
3553 }
3554
3555 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3556   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3557 Return the corresponding character code in SJIS.")
3558   (ch)
3559      Lisp_Object ch;
3560 {
3561   int charset, c1, c2, s1, s2;
3562   Lisp_Object val;
3563
3564   CHECK_NUMBER (ch, 0);
3565   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3566   if (charset == charset_jisx0208)
3567     {
3568       ENCODE_SJIS (c1, c2, s1, s2);
3569       XSETFASTINT (val, (s1 << 8) | s2);
3570     }
3571   else
3572     XSETFASTINT (val, 0);
3573   return val;
3574 }
3575
3576 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3577   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3578 CODE is the character code in BIG5.\n\
3579 Return the corresponding character.")
3580   (code)
3581      Lisp_Object code;
3582 {
3583   int charset;
3584   unsigned char b1, b2, c1, c2;
3585   Lisp_Object val;
3586
3587   CHECK_NUMBER (code, 0);
3588   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3589   DECODE_BIG5 (b1, b2, charset, c1, c2);
3590   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3591   return val;
3592 }
3593
3594 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3595   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3596 Return the corresponding character code in Big5.")
3597   (ch)
3598      Lisp_Object ch;
3599 {
3600   int charset, c1, c2, b1, b2;
3601   Lisp_Object val;
3602
3603   CHECK_NUMBER (ch, 0);
3604   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3605   if (charset == charset_big5_1 || charset == charset_big5_2)
3606     {
3607       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3608       XSETFASTINT (val, (b1 << 8) | b2);
3609     }
3610   else
3611     XSETFASTINT (val, 0);
3612   return val;
3613 }
3614
3615 DEFUN ("set-terminal-coding-system-internal",
3616        Fset_terminal_coding_system_internal,
3617        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3618   (coding_system)
3619      Lisp_Object coding_system;
3620 {
3621   CHECK_SYMBOL (coding_system, 0);
3622   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3623   /* We had better not send unexpected characters to terminal.  */
3624   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3625
3626   return Qnil;
3627 }
3628
3629 DEFUN ("set-safe-terminal-coding-system-internal",
3630        Fset_safe_terminal_coding_system_internal,
3631        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3632   (coding_system)
3633      Lisp_Object coding_system;
3634 {
3635   CHECK_SYMBOL (coding_system, 0);
3636   setup_coding_system (Fcheck_coding_system (coding_system),
3637                        &safe_terminal_coding);
3638   return Qnil;
3639 }
3640
3641 DEFUN ("terminal-coding-system",
3642        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3643   "Return coding-system of your terminal.")
3644   ()
3645 {
3646   return terminal_coding.symbol;
3647 }
3648
3649 DEFUN ("set-keyboard-coding-system-internal",
3650        Fset_keyboard_coding_system_internal,
3651        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3652   (coding_system)
3653      Lisp_Object coding_system;
3654 {
3655   CHECK_SYMBOL (coding_system, 0);
3656   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3657   return Qnil;
3658 }
3659
3660 DEFUN ("keyboard-coding-system",
3661        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3662   "Return coding-system of what is sent from terminal keyboard.")
3663   ()
3664 {
3665   return keyboard_coding.symbol;
3666 }
3667
3668 \f
3669 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3670        Sfind_operation_coding_system,  1, MANY, 0,
3671   "Choose a coding system for an operation based on the target name.\n\
3672 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3673 DECODING-SYSTEM is the coding system to use for decoding\n\
3674 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3675 for encoding (in case OPERATION does encoding).\n\
3676 \n\
3677 The first argument OPERATION specifies an I/O primitive:\n\
3678   For file I/O, `insert-file-contents' or `write-region'.\n\
3679   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3680   For network I/O, `open-network-stream'.\n\
3681 \n\
3682 The remaining arguments should be the same arguments that were passed\n\
3683 to the primitive.  Depending on which primitive, one of those arguments\n\
3684 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3685 whichever argument specifies the file name is TARGET.\n\
3686 \n\
3687 TARGET has a meaning which depends on OPERATION:\n\
3688   For file I/O, TARGET is a file name.\n\
3689   For process I/O, TARGET is a process name.\n\
3690   For network I/O, TARGET is a service name or a port number\n\
3691 \n\
3692 This function looks up what specified for TARGET in,\n\
3693 `file-coding-system-alist', `process-coding-system-alist',\n\
3694 or `network-coding-system-alist' depending on OPERATION.\n\
3695 They may specify a coding system, a cons of coding systems,\n\
3696 or a function symbol to call.\n\
3697 In the last case, we call the function with one argument,\n\
3698 which is a list of all the arguments given to this function.")
3699   (nargs, args)
3700      int nargs;
3701      Lisp_Object *args;
3702 {
3703   Lisp_Object operation, target_idx, target, val;
3704   register Lisp_Object chain;
3705
3706   if (nargs < 2)
3707     error ("Too few arguments");
3708   operation = args[0];
3709   if (!SYMBOLP (operation)
3710       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3711     error ("Invalid first arguement");
3712   if (nargs < 1 + XINT (target_idx))
3713     error ("Too few arguments for operation: %s",
3714            XSYMBOL (operation)->name->data);
3715   target = args[XINT (target_idx) + 1];
3716   if (!(STRINGP (target)
3717         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3718     error ("Invalid %dth argument", XINT (target_idx) + 1);
3719
3720   chain = ((EQ (operation, Qinsert_file_contents)
3721             || EQ (operation, Qwrite_region))
3722            ? Vfile_coding_system_alist
3723            : (EQ (operation, Qopen_network_stream)
3724               ? Vnetwork_coding_system_alist
3725               : Vprocess_coding_system_alist));
3726   if (NILP (chain))
3727     return Qnil;
3728
3729   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3730     {
3731       Lisp_Object elt = XCONS (chain)->car;
3732
3733       if (CONSP (elt)
3734           && ((STRINGP (target)
3735                && STRINGP (XCONS (elt)->car)
3736                && fast_string_match (XCONS (elt)->car, target) >= 0)
3737               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3738         {
3739           val = XCONS (elt)->cdr;
3740           if (CONSP (val))
3741             return val;
3742           if (! SYMBOLP (val))
3743             return Qnil;
3744           if (! NILP (Fcoding_system_p (val)))
3745             return Fcons (val, val);
3746           if (!NILP (Ffboundp (val)))
3747             return call1 (val, Flist (nargs, args));
3748           return Qnil;
3749         }
3750     }
3751   return Qnil;
3752 }
3753
3754 #endif /* emacs */
3755
3756 \f
3757 /*** 8. Post-amble ***/
3758
3759 init_coding_once ()
3760 {
3761   int i;
3762
3763   /* Emacs' internal format specific initialize routine.  */
3764   for (i = 0; i <= 0x20; i++)
3765     emacs_code_class[i] = EMACS_control_code;
3766   emacs_code_class[0x0A] = EMACS_linefeed_code;
3767   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3768   for (i = 0x21 ; i < 0x7F; i++)
3769     emacs_code_class[i] = EMACS_ascii_code;
3770   emacs_code_class[0x7F] = EMACS_control_code;
3771   emacs_code_class[0x80] = EMACS_leading_code_composition;
3772   for (i = 0x81; i < 0xFF; i++)
3773     emacs_code_class[i] = EMACS_invalid_code;
3774   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3775   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3776   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3777   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3778
3779   /* ISO2022 specific initialize routine.  */
3780   for (i = 0; i < 0x20; i++)
3781     iso_code_class[i] = ISO_control_code;
3782   for (i = 0x21; i < 0x7F; i++)
3783     iso_code_class[i] = ISO_graphic_plane_0;
3784   for (i = 0x80; i < 0xA0; i++)
3785     iso_code_class[i] = ISO_control_code;
3786   for (i = 0xA1; i < 0xFF; i++)
3787     iso_code_class[i] = ISO_graphic_plane_1;
3788   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3789   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3790   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3791   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3792   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3793   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3794   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3795   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3796   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3797   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3798
3799   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3800   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3801
3802   setup_coding_system (Qnil, &keyboard_coding);
3803   setup_coding_system (Qnil, &terminal_coding);
3804   setup_coding_system (Qnil, &safe_terminal_coding);
3805
3806 #if defined (MSDOS) || defined (WINDOWSNT)
3807   system_eol_type = CODING_EOL_CRLF;
3808 #else
3809   system_eol_type = CODING_EOL_LF;
3810 #endif
3811 }
3812
3813 #ifdef emacs
3814
3815 syms_of_coding ()
3816 {
3817   Qtarget_idx = intern ("target-idx");
3818   staticpro (&Qtarget_idx);
3819
3820   /* Target FILENAME is the first argument.  */
3821   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3822   /* Target FILENAME is the third argument.  */
3823   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3824
3825   Qcall_process = intern ("call-process");
3826   staticpro (&Qcall_process);
3827   /* Target PROGRAM is the first argument.  */
3828   Fput (Qcall_process, Qtarget_idx, make_number (0));
3829
3830   Qcall_process_region = intern ("call-process-region");
3831   staticpro (&Qcall_process_region);
3832   /* Target PROGRAM is the third argument.  */
3833   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3834
3835   Qstart_process = intern ("start-process");
3836   staticpro (&Qstart_process);
3837   /* Target PROGRAM is the third argument.  */
3838   Fput (Qstart_process, Qtarget_idx, make_number (2));
3839
3840   Qopen_network_stream = intern ("open-network-stream");
3841   staticpro (&Qopen_network_stream);
3842   /* Target SERVICE is the fourth argument.  */
3843   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3844
3845   Qcoding_system = intern ("coding-system");
3846   staticpro (&Qcoding_system);
3847
3848   Qeol_type = intern ("eol-type");
3849   staticpro (&Qeol_type);
3850
3851   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3852   staticpro (&Qbuffer_file_coding_system);
3853
3854   Qpost_read_conversion = intern ("post-read-conversion");
3855   staticpro (&Qpost_read_conversion);
3856
3857   Qpre_write_conversion = intern ("pre-write-conversion");
3858   staticpro (&Qpre_write_conversion);
3859
3860   Qcoding_system_spec = intern ("coding-system-spec");
3861   staticpro (&Qcoding_system_spec);
3862
3863   Qcoding_system_p = intern ("coding-system-p");
3864   staticpro (&Qcoding_system_p);
3865
3866   Qcoding_system_error = intern ("coding-system-error");
3867   staticpro (&Qcoding_system_error);
3868
3869   Fput (Qcoding_system_error, Qerror_conditions,
3870         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3871   Fput (Qcoding_system_error, Qerror_message,
3872         build_string ("Invalid coding system"));
3873
3874   Qcoding_category_index = intern ("coding-category-index");
3875   staticpro (&Qcoding_category_index);
3876
3877   {
3878     int i;
3879     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3880       {
3881         coding_category_table[i] = intern (coding_category_name[i]);
3882         staticpro (&coding_category_table[i]);
3883         Fput (coding_category_table[i], Qcoding_category_index,
3884               make_number (i));
3885       }
3886   }
3887
3888   Qcharacter_unification_table = intern ("character-unification-table");
3889   staticpro (&Qcharacter_unification_table);
3890   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3891         make_number (0));
3892
3893   Qcharacter_unification_table_for_decode
3894     = intern ("character-unification-table-for-decode");
3895   staticpro (&Qcharacter_unification_table_for_decode);
3896
3897   Qcharacter_unification_table_for_encode
3898     = intern ("character-unification-table-for-encode");
3899   staticpro (&Qcharacter_unification_table_for_encode);
3900
3901   Qemacs_mule = intern ("emacs-mule");
3902   staticpro (&Qemacs_mule);
3903
3904   defsubr (&Scoding_system_spec);
3905   defsubr (&Scoding_system_p);
3906   defsubr (&Sread_coding_system);
3907   defsubr (&Sread_non_nil_coding_system);
3908   defsubr (&Scheck_coding_system);
3909   defsubr (&Sdetect_coding_region);
3910   defsubr (&Sdecode_coding_region);
3911   defsubr (&Sencode_coding_region);
3912   defsubr (&Sdecode_coding_string);
3913   defsubr (&Sencode_coding_string);
3914   defsubr (&Sdecode_sjis_char);
3915   defsubr (&Sencode_sjis_char);
3916   defsubr (&Sdecode_big5_char);
3917   defsubr (&Sencode_big5_char);
3918   defsubr (&Sset_terminal_coding_system_internal);
3919   defsubr (&Sset_safe_terminal_coding_system_internal);
3920   defsubr (&Sterminal_coding_system);
3921   defsubr (&Sset_keyboard_coding_system_internal);
3922   defsubr (&Skeyboard_coding_system);
3923   defsubr (&Sfind_operation_coding_system);
3924
3925   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3926     "List of coding-categories (symbols) ordered by priority.");
3927   {
3928     int i;
3929
3930     Vcoding_category_list = Qnil;
3931     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3932       Vcoding_category_list
3933         = Fcons (coding_category_table[i], Vcoding_category_list);
3934   }
3935
3936   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3937     "Specify the coding system for read operations.\n\
3938 It is useful to bind this variable with `let', but do not set it globally.\n\
3939 If the value is a coding system, it is used for decoding on read operation.\n\
3940 If not, an appropriate element is used from one of the coding system alists:\n\
3941 There are three such tables, `file-coding-system-alist',\n\
3942 `process-coding-system-alist', and `network-coding-system-alist'.");
3943   Vcoding_system_for_read = Qnil;
3944
3945   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3946     "Specify the coding system for write operations.\n\
3947 It is useful to bind this variable with `let', but do not set it globally.\n\
3948 If the value is a coding system, it is used for encoding on write operation.\n\
3949 If not, an appropriate element is used from one of the coding system alists:\n\
3950 There are three such tables, `file-coding-system-alist',\n\
3951 `process-coding-system-alist', and `network-coding-system-alist'.");
3952   Vcoding_system_for_write = Qnil;
3953
3954   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3955     "Coding system used in the latest file or process I/O.");
3956   Vlast_coding_system_used = Qnil;
3957
3958   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3959     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3960   inhibit_eol_conversion = 0;
3961
3962   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3963     "Alist to decide a coding system to use for a file I/O operation.\n\
3964 The format is ((PATTERN . VAL) ...),\n\
3965 where PATTERN is a regular expression matching a file name,\n\
3966 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3967 If VAL is a coding system, it is used for both decoding and encoding\n\
3968 the file contents.\n\
3969 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3970 and the cdr part is used for encoding.\n\
3971 If VAL is a function symbol, the function must return a coding system\n\
3972 or a cons of coding systems which are used as above.\n\
3973 \n\
3974 See also the function `find-operation-coding-system'.");
3975   Vfile_coding_system_alist = Qnil;
3976
3977   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3978     "Alist to decide a coding system to use for a process I/O operation.\n\
3979 The format is ((PATTERN . VAL) ...),\n\
3980 where PATTERN is a regular expression matching a program name,\n\
3981 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3982 If VAL is a coding system, it is used for both decoding what received\n\
3983 from the program and encoding what sent to the program.\n\
3984 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3985 and the cdr part is used for encoding.\n\
3986 If VAL is a function symbol, the function must return a coding system\n\
3987 or a cons of coding systems which are used as above.\n\
3988 \n\
3989 See also the function `find-operation-coding-system'.");
3990   Vprocess_coding_system_alist = Qnil;
3991
3992   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3993     "Alist to decide a coding system to use for a network I/O operation.\n\
3994 The format is ((PATTERN . VAL) ...),\n\
3995 where PATTERN is a regular expression matching a network service name\n\
3996 or is a port number to connect to,\n\
3997 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3998 If VAL is a coding system, it is used for both decoding what received\n\
3999 from the network stream and encoding what sent to the network stream.\n\
4000 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4001 and the cdr part is used for encoding.\n\
4002 If VAL is a function symbol, the function must return a coding system\n\
4003 or a cons of coding systems which are used as above.\n\
4004 \n\
4005 See also the function `find-operation-coding-system'.");
4006   Vnetwork_coding_system_alist = Qnil;
4007
4008   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4009     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4010   eol_mnemonic_unix = ':';
4011
4012   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4013     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4014   eol_mnemonic_dos = '\\';
4015
4016   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4017     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4018   eol_mnemonic_mac = '/';
4019
4020   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4021     "Mnemonic character indicating end-of-line format is not yet decided.");
4022   eol_mnemonic_undecided = ':';
4023
4024   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4025     "Non-nil means ISO 2022 encoder/decoder do character unification.");
4026   Venable_character_unification = Qt;
4027
4028   DEFVAR_LISP ("standard-character-unification-table-for-decode",
4029     &Vstandard_character_unification_table_for_decode,
4030     "Table for unifying characters when reading.");
4031   Vstandard_character_unification_table_for_decode = Qnil;
4032
4033   DEFVAR_LISP ("standard-character-unification-table-for-encode",
4034     &Vstandard_character_unification_table_for_encode,
4035     "Table for unifying characters when writing.");
4036   Vstandard_character_unification_table_for_encode = Qnil;
4037
4038   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4039     "Alist of charsets vs revision numbers.\n\
4040 While encoding, if a charset (car part of an element) is found,\n\
4041 designate it with the escape sequence identifing revision (cdr part of the element).");
4042   Vcharset_revision_alist = Qnil;
4043
4044   DEFVAR_LISP ("default-process-coding-system",
4045                &Vdefault_process_coding_system,
4046     "Cons of coding systems used for process I/O by default.\n\
4047 The car part is used for decoding a process output,\n\
4048 the cdr part is used for encoding a text to be sent to a process.");
4049   Vdefault_process_coding_system = Qnil;
4050
4051   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4052     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4053 This is a vector of length 256.\n\
4054 If Nth element is non-nil, the existence of code N in a file\n\
4055 (or output of subprocess) doesn't prevent it to be detected as\n\
4056 a coding system of ISO 2022 variant which has a flag\n\
4057 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4058 or reading output of a subprocess.\n\
4059 Only 128th through 159th elements has a meaning.");
4060   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4061 }
4062
4063 #endif /* emacs */