src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-internal), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in the section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and such coding
  53   systems used in Internet communication as ISO-2022-JP are all
  54   variants of ISO2022.  Details are described in the section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   the section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in the section 4.  In this file, when written as "BIG5"
  67   (all uppercase), it means the coding system, and when written as
  68   "Big5" (capitalized), it means the character set.
  69
  70   4. Else
  71
  72   If a user want to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represent a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See the section 6 for more
  81   detail.
  82
  83 */
  84
  85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  86
  87   How end-of-line of a text is encoded depends on a system.  For
  88   instance, Unix's format is just one byte of `line-feed' code,
  89   whereas DOS's format is two bytes sequence of `carriage-return' and
  90   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  91
  92   Since how characters in a text is encoded and how end-of-line is
  93   encoded is independent, any coding system described above can take
  94   any format of end-of-line.  So, Emacs has information of format of
  95   end-of-line in each coding-system.  See the section 6 for more
  96   detail.
  97
  98 */
  99
 100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 101
 102   These functions check if a text between SRC and SRC_END is encoded
 103   in the coding system category XXX.  Each returns an integer value in
 104   which appropriate flag bits for the category XXX is set.  The flag
 105   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 106   template of these functions.  */
 107 #if 0
 108 int
 109 detect_coding_emacs_mule (src, src_end)
 110      unsigned char *src, *src_end;
 111 {
 112   ...
 113 }
 114 #endif
 115
 116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 117
 118   These functions decode SRC_BYTES length text at SOURCE encoded in
 119   CODING to Emacs' internal format (emacs-mule).  The resulting text
 120   goes to a place pointed by DESTINATION, the length of which should
 121   not exceed DST_BYTES.  The bytes actually processed is returned as
 122   *CONSUMED.  The return value is the length of the decoded text.
 123   Below is a template of these functions.  */
 124 #if 0
 125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 126      struct coding_system *coding;
 127      unsigned char *source, *destination;
 128      int src_bytes, dst_bytes;
 129      int *consumed;
 130 {
 131   ...
 132 }
 133 #endif
 134
 135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 136
 137   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 138   internal format (emacs-mule) to CODING.  The resulting text goes to
 139   a place pointed by DESTINATION, the length of which should not
 140   exceed DST_BYTES.  The bytes actually processed is returned as
 141   *CONSUMED.  The return value is the length of the encoded text.
 142   Below is a template of these functions.  */
 143 #if 0
 144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148      int *consumed;
 149 {
 150   ...
 151 }
 152 #endif
 153
 154 /*** COMMONLY USED MACROS ***/
 155
 156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 157    THREE_MORE_BYTES safely get one, two, and three bytes from the
 158    source text respectively.  If there are not enough bytes in the
 159    source, they jump to `label_end_of_loop'.  The caller should set
 160    variables `src' and `src_end' to appropriate areas in advance.  */
 161
 162 #define ONE_MORE_BYTE(c1)       \
 163   do {                          \
 164     if (src < src_end)          \
 165       c1 = *src++;              \
 166     else                        \
 167       goto label_end_of_loop;   \
 168   } while (0)
 169
 170 #define TWO_MORE_BYTES(c1, c2)  \
 171   do {                          \
 172     if (src + 1 < src_end)      \
 173       c1 = *src++, c2 = *src++; \
 174     else                        \
 175       goto label_end_of_loop;   \
 176   } while (0)
 177
 178 #define THREE_MORE_BYTES(c1, c2, c3)            \
 179   do {                                          \
 180     if (src + 2 < src_end)                      \
 181       c1 = *src++, c2 = *src++, c3 = *src++;    \
 182     else                                        \
 183       goto label_end_of_loop;                   \
 184   } while (0)
 185
 186 /* The following three macros DECODE_CHARACTER_ASCII,
 187    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 188    the multi-byte form of a character of each class at the place
 189    pointed by `dst'.  The caller should set the variable `dst' to
 190    point to an appropriate area and the variable `coding' to point to
 191    the coding-system of the currently decoding text in advance.  */
 192
 193 /* Decode one ASCII character C.  */
 194
 195 #define DECODE_CHARACTER_ASCII(c)                               \
 196   do {                                                          \
 197     if (COMPOSING_P (coding->composing))                        \
 198       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 199     else                                                        \
 200       *dst++ = (c);                                             \
 201   } while (0)
 202
 203 /* Decode one DIMENSION1 character of which charset is CHARSET and
 204    position-code is C.  */
 205
 206 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 207   do {                                                                  \
 208     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 209     if (COMPOSING_P (coding->composing))                                \
 210       *dst++ = leading_code + 0x20;                                     \
 211     else                                                                \
 212       *dst++ = leading_code;                                            \
 213     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 214       *dst++ = leading_code;                                            \
 215     *dst++ = (c) | 0x80;                                                \
 216   } while (0)
 217
 218 /* Decode one DIMENSION2 character of which charset is CHARSET and
 219    position-codes are C1 and C2.  */
 220
 221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 222   do {                                                  \
 223     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 224     *dst++ = (c2) | 0x80;                               \
 225   } while (0)
 226
 227 \f
 228 /*** 1. Preamble ***/
 229
 230 #include <stdio.h>
 231
 232 #ifdef emacs
 233
 234 #include <config.h>
 235 #include "lisp.h"
 236 #include "buffer.h"
 237 #include "charset.h"
 238 #include "ccl.h"
 239 #include "coding.h"
 240 #include "window.h"
 241
 242 #else  /* not emacs */
 243
 244 #include "mulelib.h"
 245
 246 #endif /* not emacs */
 247
 248 Lisp_Object Qcoding_system, Qeol_type;
 249 Lisp_Object Qbuffer_file_coding_system;
 250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 251
 252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 254 Lisp_Object Qstart_process, Qopen_network_stream;
 255 Lisp_Object Qtarget_idx;
 256
 257 /* Mnemonic character of each format of end-of-line.  */
 258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 259 /* Mnemonic character to indicate format of end-of-line is not yet
 260    decided.  */
 261 int eol_mnemonic_undecided;
 262
 263 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 264    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 265 int system_eol_type;
 266
 267 #ifdef emacs
 268
 269 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 270
 271 /* Coding system emacs-mule is for converting only end-of-line format.  */
 272 Lisp_Object Qemacs_mule;
 273
 274 /* Coding-systems are handed between Emacs Lisp programs and C internal
 275    routines by the following three variables.  */
 276 /* Coding-system for reading files and receiving data from process.  */
 277 Lisp_Object Vcoding_system_for_read;
 278 /* Coding-system for writing files and sending data to process.  */
 279 Lisp_Object Vcoding_system_for_write;
 280 /* Coding-system actually used in the latest I/O.  */
 281 Lisp_Object Vlast_coding_system_used;
 282
 283 /* Flag to inhibit code conversion of end-of-line format.  */
 284 int inhibit_eol_conversion;
 285
 286 /* Coding-system of what terminal accept for displaying.  */
 287 struct coding_system terminal_coding;
 288
 289 /* Coding-system of what is sent from terminal keyboard.  */
 290 struct coding_system keyboard_coding;
 291
 292 Lisp_Object Vfile_coding_system_alist;
 293 Lisp_Object Vprocess_coding_system_alist;
 294 Lisp_Object Vnetwork_coding_system_alist;
 295
 296 #endif /* emacs */
 297
 298 Lisp_Object Qcoding_category_index;
 299
 300 /* List of symbols `coding-category-xxx' ordered by priority.  */
 301 Lisp_Object Vcoding_category_list;
 302
 303 /* Table of coding-systems currently assigned to each coding-category.  */
 304 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 305
 306 /* Table of names of symbol for each coding-category.  */
 307 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 308   "coding-category-emacs-mule",
 309   "coding-category-sjis",
 310   "coding-category-iso-7",
 311   "coding-category-iso-8-1",
 312   "coding-category-iso-8-2",
 313   "coding-category-iso-else",
 314   "coding-category-big5",
 315   "coding-category-binary"
 316 };
 317
 318 /* Flag to tell if we look up unification table on character code
 319    conversion.  */
 320 Lisp_Object Venable_character_unification;
 321 /* Standard unification table to look up on decoding (reading).  */
 322 Lisp_Object Vstandard_character_unification_table_for_decode;
 323 /* Standard unification table to look up on encoding (writing).  */
 324 Lisp_Object Vstandard_character_unification_table_for_encode;
 325
 326 Lisp_Object Qcharacter_unification_table;
 327 Lisp_Object Qcharacter_unification_table_for_decode;
 328 Lisp_Object Qcharacter_unification_table_for_encode;
 329
 330 /* Alist of charsets vs revision number.  */
 331 Lisp_Object Vcharset_revision_alist;
 332
 333 /* Default coding systems used for process I/O.  */
 334 Lisp_Object Vdefault_process_coding_system;
 335
 336 \f
 337 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 338
 339 /* Emacs' internal format for encoding multiple character sets is a
 340    kind of multi-byte encoding, i.e. encoding a character by a sequence
 341    of one-byte codes of variable length.  ASCII characters and control
 342    characters (e.g. `tab', `newline') are represented by one-byte as
 343    is.  It takes the range 0x00 through 0x7F.  The other characters
 344    are represented by a sequence of `base leading-code', optional
 345    `extended leading-code', and one or two `position-code's.  Length
 346    of the sequence is decided by the base leading-code.  Leading-code
 347    takes the range 0x80 through 0x9F, whereas extended leading-code
 348    and position-code take the range 0xA0 through 0xFF.  See the
 349    document of `charset.h' for more detail about leading-code and
 350    position-code.
 351
 352    There's one exception in this rule.  Special leading-code
 353    `leading-code-composition' denotes that the following several
 354    characters should be composed into one character.  Leading-codes of
 355    components (except for ASCII) are added 0x20.  An ASCII character
 356    component is represented by a 2-byte sequence of `0xA0' and
 357    `ASCII-code + 0x80'.  See also the document in `charset.h' for the
 358    detail of composite character.  Hence, we can summarize the code
 359    range as follows:
 360
 361    --- CODE RANGE of Emacs' internal format ---
 362    (character set)      (range)
 363    ASCII                0x00 .. 0x7F
 364    ELSE (1st byte)      0x80 .. 0x9F
 365         (rest bytes)    0xA0 .. 0xFF
 366    ---------------------------------------------
 367
 368   */
 369
 370 enum emacs_code_class_type emacs_code_class[256];
 371
 372 /* Go to the next statement only if *SRC is accessible and the code is
 373    greater than 0xA0.  */
 374 #define CHECK_CODE_RANGE_A0_FF  \
 375   do {                          \
 376     if (src >= src_end)         \
 377       goto label_end_of_switch; \
 378     else if (*src++ < 0xA0)     \
 379       return 0;                 \
 380   } while (0)
 381
 382 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 383    Check if a text is encoded in Emacs' internal format.  If it is,
 384    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 385
 386 int
 387 detect_coding_emacs_mule (src, src_end)
 388      unsigned char *src, *src_end;
 389 {
 390   unsigned char c;
 391   int composing = 0;
 392
 393   while (src < src_end)
 394     {
 395       c = *src++;
 396
 397       if (composing)
 398         {
 399           if (c < 0xA0)
 400             composing = 0;
 401           else
 402             c -= 0x20;
 403         }
 404
 405       switch (emacs_code_class[c])
 406         {
 407         case EMACS_ascii_code:
 408         case EMACS_linefeed_code:
 409           break;
 410
 411         case EMACS_control_code:
 412           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 413             return 0;
 414           break;
 415
 416         case EMACS_invalid_code:
 417           return 0;
 418
 419         case EMACS_leading_code_composition: /* c == 0x80 */
 420           if (composing)
 421             CHECK_CODE_RANGE_A0_FF;
 422           else
 423             composing = 1;
 424           break;
 425
 426         case EMACS_leading_code_4:
 427           CHECK_CODE_RANGE_A0_FF;
 428           /* fall down to check it two more times ...  */
 429
 430         case EMACS_leading_code_3:
 431           CHECK_CODE_RANGE_A0_FF;
 432           /* fall down to check it one more time ...  */
 433
 434         case EMACS_leading_code_2:
 435           CHECK_CODE_RANGE_A0_FF;
 436           break;
 437
 438         default:
 439         label_end_of_switch:
 440           break;
 441         }
 442     }
 443   return CODING_CATEGORY_MASK_EMACS_MULE;
 444 }
 445
 446 \f
 447 /*** 3. ISO2022 handlers ***/
 448
 449 /* The following note describes the coding system ISO2022 briefly.
 450    Since the intension of this note is to help understanding of the
 451    programs in this file, some parts are NOT ACCURATE or OVERLY
 452    SIMPLIFIED.  For the thorough understanding, please refer to the
 453    original document of ISO2022.
 454
 455    ISO2022 provides many mechanisms to encode several character sets
 456    in 7-bit and 8-bit environment.  If one choose 7-bite environment,
 457    all text is encoded by codes of less than 128.  This may make the
 458    encoded text a little bit longer, but the text get more stability
 459    to pass through several gateways (some of them split MSB off).
 460
 461    There are two kind of character set: control character set and
 462    graphic character set.  The former contains control characters such
 463    as `newline' and `escape' to provide control functions (control
 464    functions are provided also by escape sequence).  The latter
 465    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 466    two control character sets and many graphic character sets.
 467
 468    Graphic character sets are classified into one of the following
 469    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 470    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 471    bytes (DIMENSION) and the number of characters in one dimension
 472    (CHARS) of the set.  In addition, each character set is assigned an
 473    identification tag (called "final character" and denoted as <F>
 474    here after) which is unique in each class.  <F> of each character
 475    set is decided by ECMA(*) when it is registered in ISO.  Code range
 476    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 477
 478    Note (*): ECMA = European Computer Manufacturers Association
 479
 480    Here are examples of graphic character set [NAME(<F>)]:
 481         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 482         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 483         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 484         o DIMENSION2_CHARS96 -- none for the moment
 485
 486    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 487         C0 [0x00..0x1F] -- control character plane 0
 488         GL [0x20..0x7F] -- graphic character plane 0
 489         C1 [0x80..0x9F] -- control character plane 1
 490         GR [0xA0..0xFF] -- graphic character plane 1
 491
 492    A control character set is directly designated and invoked to C0 or
 493    C1 by an escape sequence.  The most common case is that ISO646's
 494    control character set is designated/invoked to C0 and ISO6429's
 495    control character set is designated/invoked to C1, and usually
 496    these designations/invocations are omitted in a coded text.  With
 497    7-bit environment, only C0 can be used, and a control character for
 498    C1 is encoded by an appropriate escape sequence to fit in the
 499    environment.  All control characters for C1 are defined the
 500    corresponding escape sequences.
 501
 502    A graphic character set is at first designated to one of four
 503    graphic registers (G0 through G3), then these graphic registers are
 504    invoked to GL or GR.  These designations and invocations can be
 505    done independently.  The most common case is that G0 is invoked to
 506    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 507    these invocations and designations are omitted in a coded text.
 508    With 7-bit environment, only GL can be used.
 509
 510    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 511    and 0x7F of GL area work as control characters SPACE and DEL
 512    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 513
 514    There are two ways of invocation: locking-shift and single-shift.
 515    With locking-shift, the invocation lasts until the next different
 516    invocation, whereas with single-shift, the invocation works only
 517    for the following character and doesn't affect locking-shift.
 518    Invocations are done by the following control characters or escape
 519    sequences.
 520
 521    ----------------------------------------------------------------------
 522    function             control char    escape sequence description
 523    ----------------------------------------------------------------------
 524    SI  (shift-in)               0x0F    none            invoke G0 to GL
 525    SI  (shift-out)              0x0E    none            invoke G1 to GL
 526    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 527    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 528    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 529    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 530    ----------------------------------------------------------------------
 531    The first four are for locking-shift.  Control characters for these
 532    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 533
 534    Designations are done by the following escape sequences.
 535    ----------------------------------------------------------------------
 536    escape sequence      description
 537    ----------------------------------------------------------------------
 538    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 539    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 540    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 541    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 542    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 543    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 544    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 545    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 546    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 547    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 548    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 549    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 550    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 551    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 552    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 553    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 554    ----------------------------------------------------------------------
 555
 556    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 557    of dimension 1, chars 94, and final character <F>, and etc.
 558
 559    Note (*): Although these designations are not allowed in ISO2022,
 560    Emacs accepts them on decoding, and produces them on encoding
 561    CHARS96 character set in a coding system which is characterized as
 562    7-bit environment, non-locking-shift, and non-single-shift.
 563
 564    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 565    '(' can be omitted.  We call this as "short-form" here after.
 566
 567    Now you may notice that there are a lot of ways for encoding the
 568    same multilingual text in ISO2022.  Actually, there exist many
 569    coding systems such as Compound Text (used in X's inter client
 570    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 571    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 572    localized platforms), and all of these are variants of ISO2022.
 573
 574    In addition to the above, Emacs handles two more kinds of escape
 575    sequences: ISO6429's direction specification and Emacs' private
 576    sequence for specifying character composition.
 577
 578    ISO6429's direction specification takes the following format:
 579         o CSI ']'      -- end of the current direction
 580         o CSI '0' ']'  -- end of the current direction
 581         o CSI '1' ']'  -- start of left-to-right text
 582         o CSI '2' ']'  -- start of right-to-left text
 583    The control character CSI (0x9B: control sequence introducer) is
 584    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 585
 586    Character composition specification takes the following format:
 587         o ESC '0' -- start character composition
 588         o ESC '1' -- end character composition
 589    Since these are not standard escape sequences of any ISO, the use
 590    of them for these meaning is restricted to Emacs only.  */
 591
 592 enum iso_code_class_type iso_code_class[256];
 593
 594 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 595    Check if a text is encoded in ISO2022.  If it is, returns an
 596    integer in which appropriate flag bits any of:
 597         CODING_CATEGORY_MASK_ISO_7
 598         CODING_CATEGORY_MASK_ISO_8_1
 599         CODING_CATEGORY_MASK_ISO_8_2
 600         CODING_CATEGORY_MASK_ISO_ELSE
 601    are set.  If a code which should never appear in ISO2022 is found,
 602    returns 0.  */
 603
 604 int
 605 detect_coding_iso2022 (src, src_end)
 606      unsigned char *src, *src_end;
 607 {
 608   int mask = (CODING_CATEGORY_MASK_ISO_7
 609               | CODING_CATEGORY_MASK_ISO_8_1
 610               | CODING_CATEGORY_MASK_ISO_8_2
 611               | CODING_CATEGORY_MASK_ISO_ELSE);
 612   int g1 = 0;                   /* 1 iff designating to G1.  */
 613   int c, i;
 614
 615   while (src < src_end)
 616     {
 617       c = *src++;
 618       switch (c)
 619         {
 620         case ISO_CODE_ESC:
 621           if (src >= src_end)
 622             break;
 623           c = *src++;
 624           if (src < src_end
 625               && ((c >= '(' && c <= '/')
 626                   || c == '$' && ((*src >= '(' && *src <= '/')
 627                                   || (*src >= '@' && *src <= 'B'))))
 628             {
 629               /* Valid designation sequence.  */
 630               if (c == ')' || (c == '$' && *src == ')'))
 631                 {
 632                   g1 = 1;
 633                   mask &= ~CODING_CATEGORY_MASK_ISO_7;
 634                 }
 635               src++;
 636               break;
 637             }
 638           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 639             return CODING_CATEGORY_MASK_ISO_ELSE;
 640           break;
 641
 642         case ISO_CODE_SO:
 643           if (g1)
 644             return CODING_CATEGORY_MASK_ISO_ELSE;
 645           break;
 646
 647         case ISO_CODE_CSI:
 648         case ISO_CODE_SS2:
 649         case ISO_CODE_SS3:
 650           mask &= ~CODING_CATEGORY_MASK_ISO_7;
 651           break;
 652
 653         default:
 654           if (c < 0x80)
 655             break;
 656           else if (c < 0xA0)
 657             return 0;
 658           else
 659             {
 660               int count = 1;
 661
 662               mask &= ~CODING_CATEGORY_MASK_ISO_7;
 663               while (src < src_end && *src >= 0xA0)
 664                 count++, src++;
 665               if (count & 1 && src < src_end)
 666                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 667             }
 668           break;
 669         }
 670     }
 671
 672   return mask;
 673 }
 674
 675 /* Decode a character of which charset is CHARSET and the 1st position
 676    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 677    fetched from SRC and set to C2.  If CHARSET is negative, it means
 678    that we are decoding ill formed text, and what we can do is just to
 679    read C1 as is.  */
 680
 681 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 682   do {                                                                  \
 683     int c_alt, charset_alt = (charset);                                 \
 684     if (COMPOSING_HEAD_P (coding->composing))                           \
 685       {                                                                 \
 686         *dst++ = LEADING_CODE_COMPOSITION;                              \
 687         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 688           /* To tell composition rules are embeded.  */                 \
 689           *dst++ = 0xFF;                                                \
 690         coding->composing += 2;                                         \
 691       }                                                                 \
 692     if ((charset) >= 0)                                                 \
 693       {                                                                 \
 694         if (CHARSET_DIMENSION (charset) == 2)                           \
 695           ONE_MORE_BYTE (c2);                                           \
 696         if (!NILP (unification_table)                                   \
 697             && ((c_alt = unify_char (unification_table,                 \
 698                                      -1, (charset), c1, c2)) >= 0))     \
 699           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 700       }                                                                 \
 701     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 702       DECODE_CHARACTER_ASCII (c1);                                      \
 703     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 704       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 705     else                                                                \
 706       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 707     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 708       /* To tell a composition rule follows.  */                        \
 709       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 710   } while (0)
 711
 712 /* Set designation state into CODING.  */
 713 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 714   do {                                                                  \
 715     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 716                                      make_number (chars),               \
 717                                      make_number (final_char));         \
 718     if (charset >= 0)                                                   \
 719       {                                                                 \
 720         if (coding->direction == 1                                      \
 721             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 722           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 723         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 724       }                                                                 \
 725   } while (0)
 726
 727 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 728
 729 int
 730 decode_coding_iso2022 (coding, source, destination,
 731                        src_bytes, dst_bytes, consumed)
 732      struct coding_system *coding;
 733      unsigned char *source, *destination;
 734      int src_bytes, dst_bytes;
 735      int *consumed;
 736 {
 737   unsigned char *src = source;
 738   unsigned char *src_end = source + src_bytes;
 739   unsigned char *dst = destination;
 740   unsigned char *dst_end = destination + dst_bytes;
 741   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 742      from DST_END to assure that overflow checking is necessary only
 743      at the head of loop.  */
 744   unsigned char *adjusted_dst_end = dst_end - 6;
 745   int charset;
 746   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 747   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 748   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 749   Lisp_Object unification_table
 750       = coding->character_unification_table_for_decode;
 751
 752   if (!NILP (Venable_character_unification) && NILP (unification_table))
 753     unification_table = Vstandard_character_unification_table_for_decode;
 754
 755   while (src < src_end && dst < adjusted_dst_end)
 756     {
 757       /* SRC_BASE remembers the start position in source in each loop.
 758          The loop will be exited when there's not enough source text
 759          to analyze long escape sequence or 2-byte code (within macros
 760          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 761          to SRC_BASE before exiting.  */
 762       unsigned char *src_base = src;
 763       int c1 = *src++, c2;
 764
 765       switch (iso_code_class [c1])
 766         {
 767         case ISO_0x20_or_0x7F:
 768           if (!coding->composing
 769               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 770             {
 771               /* This is SPACE or DEL.  */
 772               *dst++ = c1;
 773               break;
 774             }
 775           /* This is a graphic character, we fall down ...  */
 776
 777         case ISO_graphic_plane_0:
 778           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 779             {
 780               /* This is a composition rule.  */
 781               *dst++ = c1 | 0x80;
 782               coding->composing = COMPOSING_WITH_RULE_TAIL;
 783             }
 784           else
 785             DECODE_ISO_CHARACTER (charset0, c1);
 786           break;
 787
 788         case ISO_0xA0_or_0xFF:
 789           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 790             {
 791               /* Invalid code.  */
 792               *dst++ = c1;
 793               break;
 794             }
 795           /* This is a graphic character, we fall down ... */
 796
 797         case ISO_graphic_plane_1:
 798           DECODE_ISO_CHARACTER (charset1, c1);
 799           break;
 800
 801         case ISO_control_code:
 802           /* All ISO2022 control characters in this class have the
 803              same representation in Emacs internal format.  */
 804           *dst++ = c1;
 805           break;
 806
 807         case ISO_carriage_return:
 808           if (coding->eol_type == CODING_EOL_CR)
 809             {
 810               *dst++ = '\n';
 811             }
 812           else if (coding->eol_type == CODING_EOL_CRLF)
 813             {
 814               ONE_MORE_BYTE (c1);
 815               if (c1 == ISO_CODE_LF)
 816                 *dst++ = '\n';
 817               else
 818                 {
 819                   src--;
 820                   *dst++ = c1;
 821                 }
 822             }
 823           else
 824             {
 825               *dst++ = c1;
 826             }
 827           break;
 828
 829         case ISO_shift_out:
 830           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 831             goto label_invalid_escape_sequence;
 832           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 833           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 834           break;
 835
 836         case ISO_shift_in:
 837           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 838           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 839           break;
 840
 841         case ISO_single_shift_2_7:
 842         case ISO_single_shift_2:
 843           /* SS2 is handled as an escape sequence of ESC 'N' */
 844           c1 = 'N';
 845           goto label_escape_sequence;
 846
 847         case ISO_single_shift_3:
 848           /* SS2 is handled as an escape sequence of ESC 'O' */
 849           c1 = 'O';
 850           goto label_escape_sequence;
 851
 852         case ISO_control_sequence_introducer:
 853           /* CSI is handled as an escape sequence of ESC '[' ...  */
 854           c1 = '[';
 855           goto label_escape_sequence;
 856
 857         case ISO_escape:
 858           ONE_MORE_BYTE (c1);
 859         label_escape_sequence:
 860           /* Escape sequences handled by Emacs are invocation,
 861              designation, direction specification, and character
 862              composition specification.  */
 863           switch (c1)
 864             {
 865             case '&':           /* revision of following character set */
 866               ONE_MORE_BYTE (c1);
 867               if (!(c1 >= '@' && c1 <= '~'))
 868                 goto label_invalid_escape_sequence;
 869               ONE_MORE_BYTE (c1);
 870               if (c1 != ISO_CODE_ESC)
 871                 goto label_invalid_escape_sequence;
 872               ONE_MORE_BYTE (c1);
 873               goto label_escape_sequence;
 874
 875             case '$':           /* designation of 2-byte character set */
 876               ONE_MORE_BYTE (c1);
 877               if (c1 >= '@' && c1 <= 'B')
 878                 {       /* designation of JISX0208.1978, GB2312.1980,
 879                                    or JISX0208.1980 */
 880                   DECODE_DESIGNATION (0, 2, 94, c1);
 881                 }
 882               else if (c1 >= 0x28 && c1 <= 0x2B)
 883                 {       /* designation of DIMENSION2_CHARS94 character set */
 884                   ONE_MORE_BYTE (c2);
 885                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 886                 }
 887               else if (c1 >= 0x2C && c1 <= 0x2F)
 888                 {       /* designation of DIMENSION2_CHARS96 character set */
 889                   ONE_MORE_BYTE (c2);
 890                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 891                 }
 892               else
 893                 goto label_invalid_escape_sequence;
 894               break;
 895
 896             case 'n':           /* invocation of locking-shift-2 */
 897               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 898                 goto label_invalid_escape_sequence;
 899               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 900               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 901               break;
 902
 903             case 'o':           /* invocation of locking-shift-3 */
 904               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 905                 goto label_invalid_escape_sequence;
 906               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 907               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 908               break;
 909
 910             case 'N':           /* invocation of single-shift-2 */
 911               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 912                 goto label_invalid_escape_sequence;
 913               ONE_MORE_BYTE (c1);
 914               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 915               DECODE_ISO_CHARACTER (charset, c1);
 916               break;
 917
 918             case 'O':           /* invocation of single-shift-3 */
 919               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 920                 goto label_invalid_escape_sequence;
 921               ONE_MORE_BYTE (c1);
 922               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 923               DECODE_ISO_CHARACTER (charset, c1);
 924               break;
 925
 926             case '0':           /* start composing without embeded rules */
 927               coding->composing = COMPOSING_NO_RULE_HEAD;
 928               break;
 929
 930             case '1':           /* end composing */
 931               coding->composing = COMPOSING_NO;
 932               break;
 933
 934             case '2':           /* start composing with embeded rules */
 935               coding->composing = COMPOSING_WITH_RULE_HEAD;
 936               break;
 937
 938             case '[':           /* specification of direction */
 939               /* For the moment, nested direction is not supported.
 940                  So, the value of `coding->direction' is 0 or 1: 0
 941                  means left-to-right, 1 means right-to-left.  */
 942               ONE_MORE_BYTE (c1);
 943               switch (c1)
 944                 {
 945                 case ']':       /* end of the current direction */
 946                   coding->direction = 0;
 947
 948                 case '0':       /* end of the current direction */
 949                 case '1':       /* start of left-to-right direction */
 950                   ONE_MORE_BYTE (c1);
 951                   if (c1 == ']')
 952                     coding->direction = 0;
 953                   else
 954                     goto label_invalid_escape_sequence;
 955                   break;
 956
 957                 case '2':       /* start of right-to-left direction */
 958                   ONE_MORE_BYTE (c1);
 959                   if (c1 == ']')
 960                     coding->direction= 1;
 961                   else
 962                     goto label_invalid_escape_sequence;
 963                   break;
 964
 965                 default:
 966                   goto label_invalid_escape_sequence;
 967                 }
 968               break;
 969
 970             default:
 971               if (c1 >= 0x28 && c1 <= 0x2B)
 972                 {       /* designation of DIMENSION1_CHARS94 character set */
 973                   ONE_MORE_BYTE (c2);
 974                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 975                 }
 976               else if (c1 >= 0x2C && c1 <= 0x2F)
 977                 {       /* designation of DIMENSION1_CHARS96 character set */
 978                   ONE_MORE_BYTE (c2);
 979                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 980                 }
 981               else
 982                 {
 983                   goto label_invalid_escape_sequence;
 984                 }
 985             }
 986           /* We must update these variables now.  */
 987           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 988           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 989           break;
 990
 991         label_invalid_escape_sequence:
 992           {
 993             int length = src - src_base;
 994
 995             bcopy (src_base, dst, length);
 996             dst += length;
 997           }
 998         }
 999       continue;
1000
1001     label_end_of_loop:
1002       coding->carryover_size = src - src_base;
1003       bcopy (src_base, coding->carryover, coding->carryover_size);
1004       src = src_base;
1005       break;
1006     }
1007
1008   /* If this is the last block of the text to be decoded, we had
1009      better just flush out all remaining codes in the text although
1010      they are not valid characters.  */
1011   if (coding->last_block)
1012     {
1013       bcopy (src, dst, src_end - src);
1014       dst += (src_end - src);
1015       src = src_end;
1016     }
1017   *consumed = src - source;
1018   return dst - destination;
1019 }
1020
1021 /* ISO2022 encoding staffs.  */
1022
1023 /*
1024    It is not enough to say just "ISO2022" on encoding, but we have to
1025    specify more details.  In Emacs, each coding-system of ISO2022
1026    variant has the following specifications:
1027         1. Initial designation to G0 thru G3.
1028         2. Allows short-form designation?
1029         3. ASCII should be designated to G0 before control characters?
1030         4. ASCII should be designated to G0 at end of line?
1031         5. 7-bit environment or 8-bit environment?
1032         6. Use locking-shift?
1033         7. Use Single-shift?
1034    And the following two are only for Japanese:
1035         8. Use ASCII in place of JIS0201-1976-Roman?
1036         9. Use JISX0208-1983 in place of JISX0208-1978?
1037    These specifications are encoded in `coding->flags' as flag bits
1038    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1039    detail.
1040 */
1041
1042 /* Produce codes (escape sequence) for designating CHARSET to graphic
1043    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1044    the coding system CODING allows, produce designation sequence of
1045    short-form.  */
1046
1047 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1048   do {                                                                  \
1049     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1050     char *intermediate_char_94 = "()*+";                                \
1051     char *intermediate_char_96 = ",-./";                                \
1052     Lisp_Object temp                                                    \
1053       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1054     if (! NILP (temp))                                                  \
1055         {                                                               \
1056         *dst++ = ISO_CODE_ESC;                                          \
1057         *dst++ = '&';                                                   \
1058         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1059       }                                                                 \
1060     *dst++ = ISO_CODE_ESC;                                              \
1061     if (CHARSET_DIMENSION (charset) == 1)                               \
1062       {                                                                 \
1063         if (CHARSET_CHARS (charset) == 94)                              \
1064           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1065         else                                                            \
1066           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1067       }                                                                 \
1068     else                                                                \
1069       {                                                                 \
1070         *dst++ = '$';                                                   \
1071         if (CHARSET_CHARS (charset) == 94)                              \
1072           {                                                             \
1073             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1074                 || reg != 0                                             \
1075                 || final_char < '@' || final_char > 'B')                \
1076               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1077           }                                                             \
1078         else                                                            \
1079           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1080       }                                                                 \
1081     *dst++ = final_char;                                                \
1082     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1083   } while (0)
1084
1085 /* The following two macros produce codes (control character or escape
1086    sequence) for ISO2022 single-shift functions (single-shift-2 and
1087    single-shift-3).  */
1088
1089 #define ENCODE_SINGLE_SHIFT_2                           \
1090   do {                                                  \
1091     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1092       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1093     else                                                \
1094       *dst++ = ISO_CODE_SS2;                            \
1095     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1096   } while (0)
1097
1098 #define ENCODE_SINGLE_SHIFT_3                           \
1099   do {                                                  \
1100     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1101       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1102     else                                                \
1103       *dst++ = ISO_CODE_SS3;                            \
1104     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1105   } while (0)
1106
1107 /* The following four macros produce codes (control character or
1108    escape sequence) for ISO2022 locking-shift functions (shift-in,
1109    shift-out, locking-shift-2, and locking-shift-3).  */
1110
1111 #define ENCODE_SHIFT_IN                         \
1112   do {                                          \
1113     *dst++ = ISO_CODE_SI;                       \
1114     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1115   } while (0)
1116
1117 #define ENCODE_SHIFT_OUT                        \
1118   do {                                          \
1119     *dst++ = ISO_CODE_SO;                       \
1120     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1121   } while (0)
1122
1123 #define ENCODE_LOCKING_SHIFT_2                  \
1124   do {                                          \
1125     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1126     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1127   } while (0)
1128
1129 #define ENCODE_LOCKING_SHIFT_3                  \
1130   do {                                          \
1131     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1132     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1133   } while (0)
1134
1135 /* Produce codes for a DIMENSION1 character of which character set is
1136    CHARSET and position-code is C1.  Designation and invocation
1137    sequences are also produced in advance if necessary.  */
1138
1139
1140 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1141   do {                                                                  \
1142     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1143       {                                                                 \
1144         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1145           *dst++ = c1 & 0x7F;                                           \
1146         else                                                            \
1147           *dst++ = c1 | 0x80;                                           \
1148         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1149         break;                                                          \
1150       }                                                                 \
1151     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1152       {                                                                 \
1153         *dst++ = c1 & 0x7F;                                             \
1154         break;                                                          \
1155       }                                                                 \
1156     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1157       {                                                                 \
1158         *dst++ = c1 | 0x80;                                             \
1159         break;                                                          \
1160       }                                                                 \
1161     else                                                                \
1162       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1163          must invoke it, or, at first, designate it to some graphic     \
1164          register.  Then repeat the loop to actually produce the        \
1165          character.  */                                                 \
1166       dst = encode_invocation_designation (charset, coding, dst);       \
1167   } while (1)
1168
1169 /* Produce codes for a DIMENSION2 character of which character set is
1170    CHARSET and position-codes are C1 and C2.  Designation and
1171    invocation codes are also produced in advance if necessary.  */
1172
1173 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1174   do {                                                                  \
1175     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1176       {                                                                 \
1177         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1178           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1179         else                                                            \
1180           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1181         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1182         break;                                                          \
1183       }                                                                 \
1184     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1185       {                                                                 \
1186         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1187         break;                                                          \
1188       }                                                                 \
1189     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1190       {                                                                 \
1191         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1192         break;                                                          \
1193       }                                                                 \
1194     else                                                                \
1195       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1196          must invoke it, or, at first, designate it to some graphic     \
1197          register.  Then repeat the loop to actually produce the        \
1198          character.  */                                                 \
1199       dst = encode_invocation_designation (charset, coding, dst);       \
1200   } while (1)
1201
1202 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1203   do {                                                                    \
1204     int c_alt, charset_alt;                                               \
1205     if (!NILP (unification_table)                                         \
1206         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1207             >= 0))                                                        \
1208       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1209     else                                                                  \
1210       charset_alt = charset;                                              \
1211     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1212       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1213     else                                                                  \
1214       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1215   } while (0)
1216
1217 /* Produce designation and invocation codes at a place pointed by DST
1218    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1219    Return new DST.  */
1220
1221 unsigned char *
1222 encode_invocation_designation (charset, coding, dst)
1223      int charset;
1224      struct coding_system *coding;
1225      unsigned char *dst;
1226 {
1227   int reg;                      /* graphic register number */
1228
1229   /* At first, check designations.  */
1230   for (reg = 0; reg < 4; reg++)
1231     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1232       break;
1233
1234   if (reg >= 4)
1235     {
1236       /* CHARSET is not yet designated to any graphic registers.  */
1237       /* At first check the requested designation.  */
1238       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1239       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1240         /* Since CHARSET requests no special designation, designate it
1241            to graphic register 0.  */
1242         reg = 0;
1243
1244       ENCODE_DESIGNATION (charset, reg, coding);
1245     }
1246
1247   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1248       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1249     {
1250       /* Since the graphic register REG is not invoked to any graphic
1251          planes, invoke it to graphic plane 0.  */
1252       switch (reg)
1253         {
1254         case 0:                 /* graphic register 0 */
1255           ENCODE_SHIFT_IN;
1256           break;
1257
1258         case 1:                 /* graphic register 1 */
1259           ENCODE_SHIFT_OUT;
1260           break;
1261
1262         case 2:                 /* graphic register 2 */
1263           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1264             ENCODE_SINGLE_SHIFT_2;
1265           else
1266             ENCODE_LOCKING_SHIFT_2;
1267           break;
1268
1269         case 3:                 /* graphic register 3 */
1270           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1271             ENCODE_SINGLE_SHIFT_3;
1272           else
1273             ENCODE_LOCKING_SHIFT_3;
1274           break;
1275         }
1276     }
1277   return dst;
1278 }
1279
1280 /* The following two macros produce codes for indicating composition.  */
1281 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1282 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1283 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1284
1285 /* The following three macros produce codes for indicating direction
1286    of text.  */
1287 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1288   do {                                                  \
1289     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1290       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1291     else                                                \
1292       *dst++ = ISO_CODE_CSI;                            \
1293   } while (0)
1294
1295 #define ENCODE_DIRECTION_R2L    \
1296   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1297
1298 #define ENCODE_DIRECTION_L2R    \
1299   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1300
1301 /* Produce codes for designation and invocation to reset the graphic
1302    planes and registers to initial state.  */
1303 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1304   do {                                                                      \
1305     int reg;                                                                \
1306     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1307       ENCODE_SHIFT_IN;                                                      \
1308     for (reg = 0; reg < 4; reg++)                                           \
1309       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1310           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1311               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1312         ENCODE_DESIGNATION                                                  \
1313           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1314   } while (0)
1315
1316 /* Produce designation sequences of charsets in the line started from
1317    *SRC to a place pointed by DSTP.
1318
1319    If the current block ends before any end-of-line, we may fail to
1320    find all the necessary *designations.  */
1321 encode_designation_at_bol (coding, table, src, src_end, dstp)
1322      struct coding_system *coding;
1323      Lisp_Object table;
1324      unsigned char *src, *src_end, **dstp;
1325 {
1326   int charset, c, found = 0, reg;
1327   /* Table of charsets to be designated to each graphic register.  */
1328   int r[4];
1329   unsigned char *dst = *dstp;
1330
1331   for (reg = 0; reg < 4; reg++)
1332     r[reg] = -1;
1333
1334   while (src < src_end && *src != '\n' && found < 4)
1335     {
1336       int bytes = BYTES_BY_CHAR_HEAD (*src);
1337
1338       if (NILP (table))
1339         charset = CHARSET_AT (src);
1340       else
1341         {
1342           int c_alt, c1, c2;
1343
1344           SPLIT_STRING(src, bytes, charset, c1, c2);
1345           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1346             charset = CHAR_CHARSET (c_alt);
1347         }
1348
1349       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1350       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1351         {
1352           found++;
1353           r[reg] = charset;
1354         }
1355
1356       src += bytes;
1357     }
1358
1359   if (found)
1360     {
1361       for (reg = 0; reg < 4; reg++)
1362         if (r[reg] >= 0
1363             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1364           ENCODE_DESIGNATION (r[reg], reg, coding);
1365       *dstp = dst;
1366     }
1367 }
1368
1369 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1370
1371 int
1372 encode_coding_iso2022 (coding, source, destination,
1373                        src_bytes, dst_bytes, consumed)
1374      struct coding_system *coding;
1375      unsigned char *source, *destination;
1376      int src_bytes, dst_bytes;
1377      int *consumed;
1378 {
1379   unsigned char *src = source;
1380   unsigned char *src_end = source + src_bytes;
1381   unsigned char *dst = destination;
1382   unsigned char *dst_end = destination + dst_bytes;
1383   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1384      from DST_END to assure overflow checking is necessary only at the
1385      head of loop.  */
1386   unsigned char *adjusted_dst_end = dst_end - 19;
1387   Lisp_Object unification_table
1388       = coding->character_unification_table_for_encode;
1389
1390   if (!NILP (Venable_character_unification) && NILP (unification_table))
1391     unification_table = Vstandard_character_unification_table_for_encode;
1392
1393   while (src < src_end && dst < adjusted_dst_end)
1394     {
1395       /* SRC_BASE remembers the start position in source in each loop.
1396          The loop will be exited when there's not enough source text
1397          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1398          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1399          reset to SRC_BASE before exiting.  */
1400       unsigned char *src_base = src;
1401       int charset, c1, c2, c3, c4;
1402
1403       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1404           && CODING_SPEC_ISO_BOL (coding))
1405         {
1406           /* We have to produce designation sequences if any now.  */
1407           encode_designation_at_bol (coding, unification_table,
1408                                      src, src_end, &dst);
1409           CODING_SPEC_ISO_BOL (coding) = 0;
1410         }
1411
1412       c1 = *src++;
1413       /* If we are seeing a component of a composite character, we are
1414          seeing a leading-code specially encoded for composition, or a
1415          composition rule if composing with rule.  We must set C1
1416          to a normal leading-code or an ASCII code.  If we are not at
1417          a composed character, we must reset the composition state.  */
1418       if (COMPOSING_P (coding->composing))
1419         {
1420           if (c1 < 0xA0)
1421             {
1422               /* We are not in a composite character any longer.  */
1423               coding->composing = COMPOSING_NO;
1424               ENCODE_COMPOSITION_END;
1425             }
1426           else
1427             {
1428               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1429                 {
1430                   *dst++ = c1 & 0x7F;
1431                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1432                   continue;
1433                 }
1434               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1435                 coding->composing = COMPOSING_WITH_RULE_RULE;
1436               if (c1 == 0xA0)
1437                 {
1438                   /* This is an ASCII component.  */
1439                   ONE_MORE_BYTE (c1);
1440                   c1 &= 0x7F;
1441                 }
1442               else
1443                 /* This is a leading-code of non ASCII component.  */
1444                 c1 -= 0x20;
1445             }
1446         }
1447
1448       /* Now encode one character.  C1 is a control character, an
1449          ASCII character, or a leading-code of multi-byte character.  */
1450       switch (emacs_code_class[c1])
1451         {
1452         case EMACS_ascii_code:
1453           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1454           break;
1455
1456         case EMACS_control_code:
1457           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1458             ENCODE_RESET_PLANE_AND_REGISTER;
1459           *dst++ = c1;
1460           break;
1461
1462         case EMACS_carriage_return_code:
1463           if (!coding->selective)
1464             {
1465               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1466                 ENCODE_RESET_PLANE_AND_REGISTER;
1467               *dst++ = c1;
1468               break;
1469             }
1470           /* fall down to treat '\r' as '\n' ...  */
1471
1472         case EMACS_linefeed_code:
1473           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1474             ENCODE_RESET_PLANE_AND_REGISTER;
1475           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1476             bcopy (coding->spec.iso2022.initial_designation,
1477                    coding->spec.iso2022.current_designation,
1478                    sizeof coding->spec.iso2022.initial_designation);
1479           if (coding->eol_type == CODING_EOL_LF
1480               || coding->eol_type == CODING_EOL_UNDECIDED)
1481             *dst++ = ISO_CODE_LF;
1482           else if (coding->eol_type == CODING_EOL_CRLF)
1483             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1484           else
1485             *dst++ = ISO_CODE_CR;
1486           CODING_SPEC_ISO_BOL (coding) = 1;
1487           break;
1488
1489         case EMACS_leading_code_2:
1490           ONE_MORE_BYTE (c2);
1491           ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1492           break;
1493
1494         case EMACS_leading_code_3:
1495           TWO_MORE_BYTES (c2, c3);
1496           if (c1 < LEADING_CODE_PRIVATE_11)
1497             ENCODE_ISO_CHARACTER (c1, c2, c3);
1498           else
1499             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1500           break;
1501
1502         case EMACS_leading_code_4:
1503           THREE_MORE_BYTES (c2, c3, c4);
1504           ENCODE_ISO_CHARACTER (c2, c3, c4);
1505           break;
1506
1507         case EMACS_leading_code_composition:
1508           ONE_MORE_BYTE (c1);
1509           if (c1 == 0xFF)
1510             {
1511               coding->composing = COMPOSING_WITH_RULE_HEAD;
1512               ENCODE_COMPOSITION_WITH_RULE_START;
1513             }
1514           else
1515             {
1516               /* Rewind one byte because it is a character code of
1517                  composition elements.  */
1518               src--;
1519               coding->composing = COMPOSING_NO_RULE_HEAD;
1520               ENCODE_COMPOSITION_NO_RULE_START;
1521             }
1522           break;
1523
1524         case EMACS_invalid_code:
1525           *dst++ = c1;
1526           break;
1527         }
1528       continue;
1529     label_end_of_loop:
1530       coding->carryover_size = src - src_base;
1531       bcopy (src_base, coding->carryover, coding->carryover_size);
1532       break;
1533     }
1534
1535   /* If this is the last block of the text to be encoded, we must
1536      reset graphic planes and registers to the initial state.  */
1537   if (src >= src_end && coding->last_block)
1538     {
1539       ENCODE_RESET_PLANE_AND_REGISTER;
1540       if (coding->carryover_size > 0
1541           && coding->carryover_size < (dst_end - dst))
1542         {
1543           bcopy (coding->carryover, dst, coding->carryover_size);
1544           dst += coding->carryover_size;
1545           coding->carryover_size = 0;
1546         }
1547     }
1548   *consumed = src - source;
1549   return dst - destination;
1550 }
1551
1552 \f
1553 /*** 4. SJIS and BIG5 handlers ***/
1554
1555 /* Although SJIS and BIG5 are not ISO's coding system, They are used
1556    quite widely.  So, for the moment, Emacs supports them in the bare
1557    C code.  But, in the future, they may be supported only by CCL.  */
1558
1559 /* SJIS is a coding system encoding three character sets: ASCII, right
1560    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1561    as is.  A character of charset katakana-jisx0201 is encoded by
1562    "position-code + 0x80".  A character of charset japanese-jisx0208
1563    is encoded in 2-byte but two position-codes are divided and shifted
1564    so that it fit in the range below.
1565
1566    --- CODE RANGE of SJIS ---
1567    (character set)      (range)
1568    ASCII                0x00 .. 0x7F
1569    KATAKANA-JISX0201    0xA0 .. 0xDF
1570    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1571             (2nd byte)  0x40 .. 0xFF
1572    -------------------------------
1573
1574 */
1575
1576 /* BIG5 is a coding system encoding two character sets: ASCII and
1577    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1578    character set and is encoded in two-byte.
1579
1580    --- CODE RANGE of BIG5 ---
1581    (character set)      (range)
1582    ASCII                0x00 .. 0x7F
1583    Big5 (1st byte)      0xA1 .. 0xFE
1584         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1585    --------------------------
1586
1587    Since the number of characters in Big5 is larger than maximum
1588    characters in Emacs' charset (96x96), it can't be handled as one
1589    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1590    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1591    contains frequently used characters and the latter contains less
1592    frequently used characters.  */
1593
1594 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1595    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1596    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1597    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1598
1599 /* Number of Big5 characters which have the same code in 1st byte.  */
1600 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1601
1602 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1603   do {                                                                  \
1604     unsigned int temp                                                   \
1605       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1606     if (b1 < 0xC9)                                                      \
1607       charset = charset_big5_1;                                         \
1608     else                                                                \
1609       {                                                                 \
1610         charset = charset_big5_2;                                       \
1611         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1612       }                                                                 \
1613     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1614     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1615   } while (0)
1616
1617 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1618   do {                                                                  \
1619     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1620     if (charset == charset_big5_2)                                      \
1621       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1622     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1623     b2 = temp % BIG5_SAME_ROW;                                          \
1624     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1625   } while (0)
1626
1627 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1628   do {                                                                  \
1629     int c_alt, charset_alt = (charset);                                 \
1630     if (!NILP (unification_table)                                       \
1631         && ((c_alt = unify_char (unification_table,                     \
1632                                  -1, (charset), c1, c2)) >= 0))         \
1633           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
1634     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
1635       DECODE_CHARACTER_ASCII (c1);                                      \
1636     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
1637       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1638     else                                                                \
1639       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1640   } while (0)
1641
1642 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1643   do {                                                                    \
1644     int c_alt, charset_alt;                                               \
1645     if (!NILP (unification_table)                                         \
1646         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1647             >= 0))                                                        \
1648       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1649     else                                                                  \
1650       charset_alt = charset;                                              \
1651     if (charset_alt == charset_ascii)                                     \
1652       *dst++ = c1;                                                        \
1653     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1654       {                                                                   \
1655         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1656           *dst++ = c1;                                                    \
1657         else                                                              \
1658           *dst++ = charset_alt, *dst++ = c1;                              \
1659       }                                                                   \
1660     else                                                                  \
1661       {                                                                   \
1662         c1 &= 0x7F, c2 &= 0x7F;                                           \
1663         if (sjis_p && charset_alt == charset_jisx0208)                    \
1664           {                                                               \
1665             unsigned char s1, s2;                                         \
1666                                                                           \
1667             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1668             *dst++ = s1, *dst++ = s2;                                     \
1669           }                                                               \
1670         else if (!sjis_p                                                  \
1671                  && (charset_alt == charset_big5_1                        \
1672                      || charset_alt == charset_big5_2))                   \
1673           {                                                               \
1674             unsigned char b1, b2;                                         \
1675                                                                           \
1676             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1677             *dst++ = b1, *dst++ = b2;                                     \
1678           }                                                               \
1679         else                                                              \
1680           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1681       }                                                                   \
1682   } while (0);
1683
1684 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1685    Check if a text is encoded in SJIS.  If it is, return
1686    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1687
1688 int
1689 detect_coding_sjis (src, src_end)
1690      unsigned char *src, *src_end;
1691 {
1692   unsigned char c;
1693
1694   while (src < src_end)
1695     {
1696       c = *src++;
1697       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1698         return 0;
1699       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1700         {
1701           if (src < src_end && *src++ < 0x40)
1702             return 0;
1703         }
1704     }
1705   return CODING_CATEGORY_MASK_SJIS;
1706 }
1707
1708 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1709    Check if a text is encoded in BIG5.  If it is, return
1710    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1711
1712 int
1713 detect_coding_big5 (src, src_end)
1714      unsigned char *src, *src_end;
1715 {
1716   unsigned char c;
1717
1718   while (src < src_end)
1719     {
1720       c = *src++;
1721       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1722         return 0;
1723       if (c >= 0xA1)
1724         {
1725           if (src >= src_end)
1726             break;
1727           c = *src++;
1728           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1729             return 0;
1730         }
1731     }
1732   return CODING_CATEGORY_MASK_BIG5;
1733 }
1734
1735 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1736    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1737
1738 int
1739 decode_coding_sjis_big5 (coding, source, destination,
1740                          src_bytes, dst_bytes, consumed, sjis_p)
1741      struct coding_system *coding;
1742      unsigned char *source, *destination;
1743      int src_bytes, dst_bytes;
1744      int *consumed;
1745      int sjis_p;
1746 {
1747   unsigned char *src = source;
1748   unsigned char *src_end = source + src_bytes;
1749   unsigned char *dst = destination;
1750   unsigned char *dst_end = destination + dst_bytes;
1751   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1752      from DST_END to assure overflow checking is necessary only at the
1753      head of loop.  */
1754   unsigned char *adjusted_dst_end = dst_end - 3;
1755   Lisp_Object unification_table
1756       = coding->character_unification_table_for_decode;
1757
1758   if (!NILP (Venable_character_unification) && NILP (unification_table))
1759     unification_table = Vstandard_character_unification_table_for_decode;
1760
1761   while (src < src_end && dst < adjusted_dst_end)
1762     {
1763       /* SRC_BASE remembers the start position in source in each loop.
1764          The loop will be exited when there's not enough source text
1765          to analyze two-byte character (within macro ONE_MORE_BYTE).
1766          In that case, SRC is reset to SRC_BASE before exiting.  */
1767       unsigned char *src_base = src;
1768       unsigned char c1 = *src++, c2, c3, c4;
1769
1770       if (c1 == '\r')
1771         {
1772           if (coding->eol_type == CODING_EOL_CRLF)
1773             {
1774               ONE_MORE_BYTE (c2);
1775               if (c2 == '\n')
1776                 *dst++ = c2;
1777               else
1778                 /* To process C2 again, SRC is subtracted by 1.  */
1779                 *dst++ = c1, src--;
1780             }
1781           else
1782             *dst++ = c1;
1783         }
1784       else if (c1 < 0x20)
1785         *dst++ = c1;
1786       else if (c1 < 0x80)
1787         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1788       else if (c1 < 0xA0 || c1 >= 0xE0)
1789         {
1790           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1791           if (sjis_p)
1792             {
1793               ONE_MORE_BYTE (c2);
1794               DECODE_SJIS (c1, c2, c3, c4);
1795               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1796             }
1797           else if (c1 >= 0xE0 && c1 < 0xFF)
1798             {
1799               int charset;
1800
1801               ONE_MORE_BYTE (c2);
1802               DECODE_BIG5 (c1, c2, charset, c3, c4);
1803               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1804             }
1805           else                  /* Invalid code */
1806             *dst++ = c1;
1807         }
1808       else
1809         {
1810           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1811           if (sjis_p)
1812             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1813           else
1814             {
1815               int charset;
1816
1817               ONE_MORE_BYTE (c2);
1818               DECODE_BIG5 (c1, c2, charset, c3, c4);
1819               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1820             }
1821         }
1822       continue;
1823
1824     label_end_of_loop:
1825       coding->carryover_size = src - src_base;
1826       bcopy (src_base, coding->carryover, coding->carryover_size);
1827       src = src_base;
1828       break;
1829     }
1830
1831   *consumed = src - source;
1832   return dst - destination;
1833 }
1834
1835 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1836    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1837    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1838    sure that all these charsets are registered as official charset
1839    (i.e. do not have extended leading-codes).  Characters of other
1840    charsets are produced without any encoding.  If SJIS_P is 1, encode
1841    SJIS text, else encode BIG5 text.  */
1842
1843 int
1844 encode_coding_sjis_big5 (coding, source, destination,
1845                          src_bytes, dst_bytes, consumed, sjis_p)
1846      struct coding_system *coding;
1847      unsigned char *source, *destination;
1848      int src_bytes, dst_bytes;
1849      int *consumed;
1850      int sjis_p;
1851 {
1852   unsigned char *src = source;
1853   unsigned char *src_end = source + src_bytes;
1854   unsigned char *dst = destination;
1855   unsigned char *dst_end = destination + dst_bytes;
1856   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1857      from DST_END to assure overflow checking is necessary only at the
1858      head of loop.  */
1859   unsigned char *adjusted_dst_end = dst_end - 1;
1860   Lisp_Object unification_table
1861       = coding->character_unification_table_for_encode;
1862
1863   if (!NILP (Venable_character_unification) && NILP (unification_table))
1864     unification_table = Vstandard_character_unification_table_for_encode;
1865
1866   while (src < src_end && dst < adjusted_dst_end)
1867     {
1868       /* SRC_BASE remembers the start position in source in each loop.
1869          The loop will be exited when there's not enough source text
1870          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1871          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1872          before exiting.  */
1873       unsigned char *src_base = src;
1874       unsigned char c1 = *src++, c2, c3, c4;
1875
1876       if (coding->composing)
1877         {
1878           if (c1 == 0xA0)
1879             {
1880               ONE_MORE_BYTE (c1);
1881               c1 &= 0x7F;
1882             }
1883           else if (c1 >= 0xA0)
1884             c1 -= 0x20;
1885           else
1886             coding->composing = 0;
1887         }
1888
1889       switch (emacs_code_class[c1])
1890         {
1891         case EMACS_ascii_code:
1892           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1893           break;
1894
1895         case EMACS_control_code:
1896           *dst++ = c1;
1897           break;
1898
1899         case EMACS_carriage_return_code:
1900           if (!coding->selective)
1901             {
1902               *dst++ = c1;
1903               break;
1904             }
1905           /* fall down to treat '\r' as '\n' ...  */
1906
1907         case EMACS_linefeed_code:
1908           if (coding->eol_type == CODING_EOL_LF
1909               || coding->eol_type == CODING_EOL_UNDECIDED)
1910             *dst++ = '\n';
1911           else if (coding->eol_type == CODING_EOL_CRLF)
1912             *dst++ = '\r', *dst++ = '\n';
1913           else
1914             *dst++ = '\r';
1915           break;
1916
1917         case EMACS_leading_code_2:
1918           ONE_MORE_BYTE (c2);
1919           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1920           break;
1921
1922         case EMACS_leading_code_3:
1923           TWO_MORE_BYTES (c2, c3);
1924           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1925           break;
1926
1927         case EMACS_leading_code_4:
1928           THREE_MORE_BYTES (c2, c3, c4);
1929           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1930           break;
1931
1932         case EMACS_leading_code_composition:
1933           coding->composing = 1;
1934           break;
1935
1936         default:                /* i.e. case EMACS_invalid_code: */
1937           *dst++ = c1;
1938         }
1939       continue;
1940
1941     label_end_of_loop:
1942       coding->carryover_size = src - src_base;
1943       bcopy (src_base, coding->carryover, coding->carryover_size);
1944       src = src_base;
1945       break;
1946     }
1947
1948   *consumed = src - source;
1949   return dst - destination;
1950 }
1951
1952 \f
1953 /*** 5. End-of-line handlers ***/
1954
1955 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1956    This function is called only when `coding->eol_type' is
1957    CODING_EOL_CRLF or CODING_EOL_CR.  */
1958
1959 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1960      struct coding_system *coding;
1961      unsigned char *source, *destination;
1962      int src_bytes, dst_bytes;
1963      int *consumed;
1964 {
1965   unsigned char *src = source;
1966   unsigned char *src_end = source + src_bytes;
1967   unsigned char *dst = destination;
1968   unsigned char *dst_end = destination + dst_bytes;
1969   int produced;
1970
1971   switch (coding->eol_type)
1972     {
1973     case CODING_EOL_CRLF:
1974       {
1975         /* Since the maximum bytes produced by each loop is 2, we
1976            subtract 1 from DST_END to assure overflow checking is
1977            necessary only at the head of loop.  */
1978         unsigned char *adjusted_dst_end = dst_end - 1;
1979
1980         while (src < src_end && dst < adjusted_dst_end)
1981           {
1982             unsigned char *src_base = src;
1983             unsigned char c = *src++;
1984             if (c == '\r')
1985               {
1986                 ONE_MORE_BYTE (c);
1987                 if (c != '\n')
1988                   *dst++ = '\r';
1989                 *dst++ = c;
1990               }
1991             else
1992               *dst++ = c;
1993             continue;
1994
1995           label_end_of_loop:
1996             coding->carryover_size = src - src_base;
1997             bcopy (src_base, coding->carryover, coding->carryover_size);
1998             src = src_base;
1999             break;
2000           }
2001         *consumed = src - source;
2002         produced = dst - destination;
2003         break;
2004       }
2005
2006     case CODING_EOL_CR:
2007       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2008       bcopy (source, destination, produced);
2009       dst_end = destination + produced;
2010       while (dst < dst_end)
2011         if (*dst++ == '\r') dst[-1] = '\n';
2012       *consumed = produced;
2013       break;
2014
2015     default:                    /* i.e. case: CODING_EOL_LF */
2016       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2017       bcopy (source, destination, produced);
2018       *consumed = produced;
2019       break;
2020     }
2021
2022   return produced;
2023 }
2024
2025 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2026    format of end-of-line according to `coding->eol_type'.  If
2027    `coding->selective' is 1, code '\r' in source text also means
2028    end-of-line.  */
2029
2030 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2031      struct coding_system *coding;
2032      unsigned char *source, *destination;
2033      int src_bytes, dst_bytes;
2034      int *consumed;
2035 {
2036   unsigned char *src = source;
2037   unsigned char *dst = destination;
2038   int produced;
2039
2040   if (src_bytes <= 0)
2041     return 0;
2042
2043   switch (coding->eol_type)
2044     {
2045     case CODING_EOL_LF:
2046     case CODING_EOL_UNDECIDED:
2047       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2048       bcopy (source, destination, produced);
2049       if (coding->selective)
2050         {
2051           int i = produced;
2052           while (i--)
2053             if (*dst++ == '\r') dst[-1] = '\n';
2054         }
2055       *consumed = produced;
2056
2057     case CODING_EOL_CRLF:
2058       {
2059         unsigned char c;
2060         unsigned char *src_end = source + src_bytes;
2061         unsigned char *dst_end = destination + dst_bytes;
2062         /* Since the maximum bytes produced by each loop is 2, we
2063            subtract 1 from DST_END to assure overflow checking is
2064            necessary only at the head of loop.  */
2065         unsigned char *adjusted_dst_end = dst_end - 1;
2066
2067         while (src < src_end && dst < adjusted_dst_end)
2068           {
2069             c = *src++;
2070             if (c == '\n' || (c == '\r' && coding->selective))
2071               *dst++ = '\r', *dst++ = '\n';
2072             else
2073               *dst++ = c;
2074           }
2075         produced = dst - destination;
2076         *consumed = src - source;
2077         break;
2078       }
2079
2080     default:                    /* i.e. case CODING_EOL_CR: */
2081       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2082       bcopy (source, destination, produced);
2083       {
2084         int i = produced;
2085         while (i--)
2086           if (*dst++ == '\n') dst[-1] = '\r';
2087       }
2088       *consumed = produced;
2089     }
2090
2091   return produced;
2092 }
2093
2094 \f
2095 /*** 6. C library functions ***/
2096
2097 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2098    has a property `coding-system'.  The value of this property is a
2099    vector of length 5 (called as coding-vector).  Among elements of
2100    this vector, the first (element[0]) and the fifth (element[4])
2101    carry important information for decoding/encoding.  Before
2102    decoding/encoding, this information should be set in fields of a
2103    structure of type `coding_system'.
2104
2105    A value of property `coding-system' can be a symbol of another
2106    subsidiary coding-system.  In that case, Emacs gets coding-vector
2107    from that symbol.
2108
2109    `element[0]' contains information to be set in `coding->type'.  The
2110    value and its meaning is as follows:
2111
2112    0 -- coding_type_emacs_mule
2113    1 -- coding_type_sjis
2114    2 -- coding_type_iso2022
2115    3 -- coding_type_big5
2116    4 -- coding_type_ccl encoder/decoder written in CCL
2117    nil -- coding_type_no_conversion
2118    t -- coding_type_undecided (automatic conversion on decoding,
2119                                no-conversion on encoding)
2120
2121    `element[4]' contains information to be set in `coding->flags' and
2122    `coding->spec'.  The meaning varies by `coding->type'.
2123
2124    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2125    of length 32 (of which the first 13 sub-elements are used now).
2126    Meanings of these sub-elements are:
2127
2128    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2129         If the value is an integer of valid charset, the charset is
2130         assumed to be designated to graphic register N initially.
2131
2132         If the value is minus, it is a minus value of charset which
2133         reserves graphic register N, which means that the charset is
2134         not designated initially but should be designated to graphic
2135         register N just before encoding a character in that charset.
2136
2137         If the value is nil, graphic register N is never used on
2138         encoding.
2139
2140    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2141         Each value takes t or nil.  See the section ISO2022 of
2142         `coding.h' for more information.
2143
2144    If `coding->type' is `coding_type_big5', element[4] is t to denote
2145    BIG5-ETen or nil to denote BIG5-HKU.
2146
2147    If `coding->type' takes the other value, element[4] is ignored.
2148
2149    Emacs Lisp's coding system also carries information about format of
2150    end-of-line in a value of property `eol-type'.  If the value is
2151    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2152    means CODING_EOL_CR.  If it is not integer, it should be a vector
2153    of subsidiary coding systems of which property `eol-type' has one
2154    of above values.
2155
2156 */
2157
2158 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2159    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2160    is setup so that no conversion is necessary and return -1, else
2161    return 0.  */
2162
2163 int
2164 setup_coding_system (coding_system, coding)
2165      Lisp_Object coding_system;
2166      struct coding_system *coding;
2167 {
2168   Lisp_Object type, eol_type;
2169
2170   /* At first, set several fields default values.  */
2171   coding->require_flushing = 0;
2172   coding->last_block = 0;
2173   coding->selective = 0;
2174   coding->composing = 0;
2175   coding->direction = 0;
2176   coding->carryover_size = 0;
2177   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2178   coding->character_unification_table_for_decode = Qnil;
2179   coding->character_unification_table_for_encode = Qnil;
2180
2181   Vlast_coding_system_used = coding->symbol = coding_system;
2182   eol_type = Qnil;
2183   /* Get value of property `coding-system' until we get a vector.
2184      While doing that, also get values of properties
2185      `post-read-conversion', `pre-write-conversion',
2186      `character-unification-table-for-decode',
2187      `character-unification-table-for-encode' and `eol-type'.  */
2188   while (!NILP (coding_system) && SYMBOLP (coding_system))
2189     {
2190       if (NILP (coding->post_read_conversion))
2191         coding->post_read_conversion = Fget (coding_system,
2192                                              Qpost_read_conversion);
2193       if (NILP (coding->pre_write_conversion))
2194         coding->pre_write_conversion = Fget (coding_system,
2195                                              Qpre_write_conversion);
2196       if (!inhibit_eol_conversion && NILP (eol_type))
2197         eol_type = Fget (coding_system, Qeol_type);
2198
2199       if (NILP (coding->character_unification_table_for_decode))
2200         coding->character_unification_table_for_decode
2201           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2202
2203       if (NILP (coding->character_unification_table_for_encode))
2204         coding->character_unification_table_for_encode
2205           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2206
2207       coding_system = Fget (coding_system, Qcoding_system);
2208     }
2209
2210   while (!NILP (coding->character_unification_table_for_decode)
2211          && SYMBOLP (coding->character_unification_table_for_decode))
2212         coding->character_unification_table_for_decode
2213           = Fget (coding->character_unification_table_for_decode,
2214                   Qcharacter_unification_table_for_decode);
2215   if (!NILP (coding->character_unification_table_for_decode)
2216       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2217       coding->character_unification_table_for_decode = Qnil;
2218
2219   while (!NILP (coding->character_unification_table_for_encode)
2220          && SYMBOLP (coding->character_unification_table_for_encode))
2221         coding->character_unification_table_for_encode
2222           = Fget (coding->character_unification_table_for_encode,
2223                   Qcharacter_unification_table_for_encode);
2224   if (!NILP (coding->character_unification_table_for_encode)
2225       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2226       coding->character_unification_table_for_encode = Qnil;
2227
2228   if (!VECTORP (coding_system)
2229       || XVECTOR (coding_system)->size != 5)
2230     goto label_invalid_coding_system;
2231
2232   if (VECTORP (eol_type))
2233     coding->eol_type = CODING_EOL_UNDECIDED;
2234   else if (XFASTINT (eol_type) == 1)
2235     coding->eol_type = CODING_EOL_CRLF;
2236   else if (XFASTINT (eol_type) == 2)
2237     coding->eol_type = CODING_EOL_CR;
2238   else
2239     coding->eol_type = CODING_EOL_LF;
2240
2241   type = XVECTOR (coding_system)->contents[0];
2242   switch (XFASTINT (type))
2243     {
2244     case 0:
2245       coding->type = coding_type_emacs_mule;
2246       break;
2247
2248     case 1:
2249       coding->type = coding_type_sjis;
2250       break;
2251
2252     case 2:
2253       coding->type = coding_type_iso2022;
2254       {
2255         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2256         Lisp_Object *flags;
2257         int i, charset, default_reg_bits = 0;
2258
2259         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2260           goto label_invalid_coding_system;
2261
2262         flags = XVECTOR (val)->contents;
2263         coding->flags
2264           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2265              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2266              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2267              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2268              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2269              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2270              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2271              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2272              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2273              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2274              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2275
2276         /* Invoke graphic register 0 to plane 0.  */
2277         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2278         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2279         CODING_SPEC_ISO_INVOCATION (coding, 1)
2280           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2281         /* Not single shifting at first.  */
2282         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2283         /* Beginning of buffer should also be regarded as bol. */
2284         CODING_SPEC_ISO_BOL(coding) = 1;
2285
2286         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2287            FLAGS[REG] can be one of below:
2288                 integer CHARSET: CHARSET occupies register I,
2289                 t: designate nothing to REG initially, but can be used
2290                   by any charsets,
2291                 list of integer, nil, or t: designate the first
2292                   element (if integer) to REG initially, the remaining
2293                   elements (if integer) is designated to REG on request,
2294                   if an element is t, REG can be used by any charset,
2295                 nil: REG is never used.  */
2296         for (charset = 0; charset <= MAX_CHARSET; charset++)
2297           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2298             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2299         for (i = 0; i < 4; i++)
2300           {
2301             if (INTEGERP (flags[i])
2302                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2303                 || (charset = get_charset_id (flags[i])) >= 0)
2304               {
2305                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2306                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2307               }
2308             else if (EQ (flags[i], Qt))
2309               {
2310                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2311                 default_reg_bits |= 1 << i;
2312               }
2313             else if (CONSP (flags[i]))
2314               {
2315                 Lisp_Object tail = flags[i];
2316
2317                 if (INTEGERP (XCONS (tail)->car)
2318                     && (charset = XINT (XCONS (tail)->car),
2319                         CHARSET_VALID_P (charset))
2320                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2321                   {
2322                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2323                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2324                   }
2325                 else
2326                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2327                 tail = XCONS (tail)->cdr;
2328                 while (CONSP (tail))
2329                   {
2330                     if (INTEGERP (XCONS (tail)->car)
2331                         && (charset = XINT (XCONS (tail)->car),
2332                             CHARSET_VALID_P (charset))
2333                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2334                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2335                         = i;
2336                     else if (EQ (XCONS (tail)->car, Qt))
2337                       default_reg_bits |= 1 << i;
2338                     tail = XCONS (tail)->cdr;
2339                   }
2340               }
2341             else
2342               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2343
2344             CODING_SPEC_ISO_DESIGNATION (coding, i)
2345               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2346           }
2347
2348         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2349           {
2350             /* REG 1 can be used only by locking shift in 7-bit env.  */
2351             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2352               default_reg_bits &= ~2;
2353             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2354               /* Without any shifting, only REG 0 and 1 can be used.  */
2355               default_reg_bits &= 3;
2356           }
2357
2358         for (charset = 0; charset <= MAX_CHARSET; charset++)
2359           if (CHARSET_VALID_P (charset)
2360               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2361                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2362             {
2363               /* We have not yet decided where to designate CHARSET.  */
2364               int reg_bits = default_reg_bits;
2365
2366               if (CHARSET_CHARS (charset) == 96)
2367                 /* A charset of CHARS96 can't be designated to REG 0.  */
2368                 reg_bits &= ~1;
2369
2370               if (reg_bits)
2371                 /* There exist some default graphic register.  */
2372                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2373                   = (reg_bits & 1
2374                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2375               else
2376                 /* We anyway have to designate CHARSET to somewhere.  */
2377                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2378                   = (CHARSET_CHARS (charset) == 94
2379                      ? 0
2380                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2381                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2382                         ? 1
2383                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2384                            ? 2 : 0)));
2385             }
2386       }
2387       coding->require_flushing = 1;
2388       break;
2389
2390     case 3:
2391       coding->type = coding_type_big5;
2392       coding->flags
2393         = (NILP (XVECTOR (coding_system)->contents[4])
2394            ? CODING_FLAG_BIG5_HKU
2395            : CODING_FLAG_BIG5_ETEN);
2396       break;
2397
2398     case 4:
2399       coding->type = coding_type_ccl;
2400       {
2401         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2402         if (CONSP  (val)
2403             && VECTORP (XCONS (val)->car)
2404             && VECTORP (XCONS (val)->cdr))
2405           {
2406             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2407             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2408           }
2409         else
2410           goto label_invalid_coding_system;
2411       }
2412       coding->require_flushing = 1;
2413       break;
2414
2415     default:
2416       if (EQ (type, Qt))
2417         coding->type = coding_type_undecided;
2418       else
2419         coding->type = coding_type_no_conversion;
2420       break;
2421     }
2422   return 0;
2423
2424  label_invalid_coding_system:
2425   coding->type = coding_type_no_conversion;
2426   coding->eol_type = CODING_EOL_LF;
2427   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2428     = Qnil;
2429   return -1;
2430 }
2431
2432 /* Emacs has a mechanism to automatically detect a coding system if it
2433    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2434    it's impossible to distinguish some coding systems accurately
2435    because they use the same range of codes.  So, at first, coding
2436    systems are categorized into 7, those are:
2437
2438    o coding-category-emacs-mule
2439
2440         The category for a coding system which has the same code range
2441         as Emacs' internal format.  Assigned the coding-system (Lisp
2442         symbol) `emacs-mule' by default.
2443
2444    o coding-category-sjis
2445
2446         The category for a coding system which has the same code range
2447         as SJIS.  Assigned the coding-system (Lisp
2448         symbol) `shift-jis' by default.
2449
2450    o coding-category-iso-7
2451
2452         The category for a coding system which has the same code range
2453         as ISO2022 of 7-bit environment.  Assigned the coding-system
2454         (Lisp symbol) `iso-2022-7' by default.
2455
2456    o coding-category-iso-8-1
2457
2458         The category for a coding system which has the same code range
2459         as ISO2022 of 8-bit environment and graphic plane 1 used only
2460         for DIMENSION1 charset.  Assigned the coding-system (Lisp
2461         symbol) `iso-8859-1' by default.
2462
2463    o coding-category-iso-8-2
2464
2465         The category for a coding system which has the same code range
2466         as ISO2022 of 8-bit environment and graphic plane 1 used only
2467         for DIMENSION2 charset.  Assigned the coding-system (Lisp
2468         symbol) `euc-japan' by default.
2469
2470    o coding-category-iso-else
2471
2472         The category for a coding system which has the same code range
2473         as ISO2022 but not belongs to any of the above three
2474         categories.  Assigned the coding-system (Lisp symbol)
2475         `iso-2022-ss2-7' by default.
2476
2477    o coding-category-big5
2478
2479         The category for a coding system which has the same code range
2480         as BIG5.  Assigned the coding-system (Lisp symbol)
2481         `cn-big5' by default.
2482
2483    o coding-category-binary
2484
2485         The category for a coding system not categorized in any of the
2486         above.  Assigned the coding-system (Lisp symbol)
2487         `no-conversion' by default.
2488
2489    Each of them is a Lisp symbol and the value is an actual
2490    `coding-system's (this is also a Lisp symbol) assigned by a user.
2491    What Emacs does actually is to detect a category of coding system.
2492    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2493    decide only one possible category, it selects a category of the
2494    highest priority.  Priorities of categories are also specified by a
2495    user in a Lisp variable `coding-category-list'.
2496
2497 */
2498
2499 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2500    If it detects possible coding systems, return an integer in which
2501    appropriate flag bits are set.  Flag bits are defined by macros
2502    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2503
2504 int
2505 detect_coding_mask (src, src_bytes)
2506      unsigned char *src;
2507      int src_bytes;
2508 {
2509   register unsigned char c;
2510   unsigned char *src_end = src + src_bytes;
2511   int mask;
2512
2513   /* At first, skip all ASCII characters and control characters except
2514      for three ISO2022 specific control characters.  */
2515  label_loop_detect_coding:
2516   while (src < src_end)
2517     {
2518       c = *src;
2519       if (c >= 0x80
2520           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2521         break;
2522       src++;
2523     }
2524
2525   if (src >= src_end)
2526     /* We found nothing other than ASCII.  There's nothing to do.  */
2527     return CODING_CATEGORY_MASK_ANY;
2528
2529   /* The text seems to be encoded in some multilingual coding system.
2530      Now, try to find in which coding system the text is encoded.  */
2531   if (c < 0x80)
2532     {
2533       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2534       /* C is an ISO2022 specific control code of C0.  */
2535       mask = detect_coding_iso2022 (src, src_end);
2536       src++;
2537       if (mask == CODING_CATEGORY_MASK_ANY)
2538         /* No valid ISO2022 code follows C.  Try again.  */
2539         goto label_loop_detect_coding;
2540     }
2541   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2542     /* C is an ISO2022 specific control code of C1,
2543        or the first byte of SJIS's 2-byte character code,
2544        or a leading code of Emacs.  */
2545     mask = (detect_coding_iso2022 (src, src_end)
2546             | detect_coding_sjis (src, src_end)
2547             | detect_coding_emacs_mule (src, src_end));
2548
2549   else if (c < 0xA0)
2550     /* C is the first byte of SJIS character code,
2551        or a leading-code of Emacs.  */
2552     mask = (detect_coding_sjis (src, src_end)
2553             | detect_coding_emacs_mule (src, src_end));
2554
2555   else
2556     /* C is a character of ISO2022 in graphic plane right,
2557        or a SJIS's 1-byte character code (i.e. JISX0201),
2558        or the first byte of BIG5's 2-byte code.  */
2559     mask = (detect_coding_iso2022 (src, src_end)
2560             | detect_coding_sjis (src, src_end)
2561             | detect_coding_big5 (src, src_end));
2562
2563   return mask;
2564 }
2565
2566 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2567    The information of the detected coding system is set in CODING.  */
2568
2569 void
2570 detect_coding (coding, src, src_bytes)
2571      struct coding_system *coding;
2572      unsigned char *src;
2573      int src_bytes;
2574 {
2575   int mask = detect_coding_mask (src, src_bytes);
2576   int idx;
2577
2578   if (mask == CODING_CATEGORY_MASK_ANY)
2579     /* We found nothing other than ASCII.  There's nothing to do.  */
2580     return;
2581
2582   if (!mask)
2583     /* The source text seems to be encoded in unknown coding system.
2584        Emacs regards the category of such a kind of coding system as
2585        `coding-category-binary'.  We assume that a user has assigned
2586        an appropriate coding system for a `coding-category-binary'.  */
2587     idx = CODING_CATEGORY_IDX_BINARY;
2588   else
2589     {
2590       /* We found some plausible coding systems.  Let's use a coding
2591          system of the highest priority.  */
2592       Lisp_Object val = Vcoding_category_list;
2593
2594       if (CONSP (val))
2595         while (!NILP (val))
2596           {
2597             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2598             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2599               break;
2600             val = XCONS (val)->cdr;
2601           }
2602       else
2603         val = Qnil;
2604
2605       if (NILP (val))
2606         {
2607           /* For unknown reason, `Vcoding_category_list' contains none
2608              of found categories.  Let's use any of them.  */
2609           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2610             if (mask & (1 << idx))
2611               break;
2612         }
2613     }
2614   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2615 }
2616
2617 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2618    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2619    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2620
2621 int
2622 detect_eol_type (src, src_bytes)
2623      unsigned char *src;
2624      int src_bytes;
2625 {
2626   unsigned char *src_end = src + src_bytes;
2627   unsigned char c;
2628
2629   while (src < src_end)
2630     {
2631       c = *src++;
2632       if (c == '\n')
2633         return CODING_EOL_LF;
2634       else if (c == '\r')
2635         {
2636           if (src < src_end && *src == '\n')
2637             return CODING_EOL_CRLF;
2638           else
2639             return CODING_EOL_CR;
2640         }
2641     }
2642   return CODING_EOL_UNDECIDED;
2643 }
2644
2645 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2646    is encoded.  If it detects an appropriate format of end-of-line, it
2647    sets the information in *CODING.  */
2648
2649 void
2650 detect_eol (coding, src, src_bytes)
2651      struct coding_system *coding;
2652      unsigned char *src;
2653      int src_bytes;
2654 {
2655   Lisp_Object val;
2656   int eol_type = detect_eol_type (src, src_bytes);
2657
2658   if (eol_type == CODING_EOL_UNDECIDED)
2659     /*  We found no end-of-line in the source text.  */
2660     return;
2661
2662   val = Fget (coding->symbol, Qeol_type);
2663   if (VECTORP (val) && XVECTOR (val)->size == 3)
2664     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2665 }
2666
2667 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2668    decoding, it may detect coding system and format of end-of-line if
2669    those are not yet decided.  */
2670
2671 int
2672 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2673      struct coding_system *coding;
2674      unsigned char *source, *destination;
2675      int src_bytes, dst_bytes;
2676      int *consumed;
2677 {
2678   int produced;
2679
2680   if (src_bytes <= 0)
2681     {
2682       *consumed = 0;
2683       return 0;
2684     }
2685
2686   if (coding->type == coding_type_undecided)
2687     detect_coding (coding, source, src_bytes);
2688
2689   if (coding->eol_type == CODING_EOL_UNDECIDED)
2690     detect_eol (coding, source, src_bytes);
2691
2692   coding->carryover_size = 0;
2693   switch (coding->type)
2694     {
2695     case coding_type_no_conversion:
2696     label_no_conversion:
2697       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2698       bcopy (source, destination, produced);
2699       *consumed = produced;
2700       break;
2701
2702     case coding_type_emacs_mule:
2703     case coding_type_undecided:
2704       if (coding->eol_type == CODING_EOL_LF
2705           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2706         goto label_no_conversion;
2707       produced = decode_eol (coding, source, destination,
2708                              src_bytes, dst_bytes, consumed);
2709       break;
2710
2711     case coding_type_sjis:
2712       produced = decode_coding_sjis_big5 (coding, source, destination,
2713                                           src_bytes, dst_bytes, consumed,
2714                                           1);
2715       break;
2716
2717     case coding_type_iso2022:
2718       produced = decode_coding_iso2022 (coding, source, destination,
2719                                         src_bytes, dst_bytes, consumed);
2720       break;
2721
2722     case coding_type_big5:
2723       produced = decode_coding_sjis_big5 (coding, source, destination,
2724                                           src_bytes, dst_bytes, consumed,
2725                                           0);
2726       break;
2727
2728     case coding_type_ccl:
2729       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2730                              src_bytes, dst_bytes, consumed);
2731       break;
2732     }
2733
2734   return produced;
2735 }
2736
2737 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2738
2739 int
2740 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2741      struct coding_system *coding;
2742      unsigned char *source, *destination;
2743      int src_bytes, dst_bytes;
2744      int *consumed;
2745 {
2746   int produced;
2747
2748   coding->carryover_size = 0;
2749   switch (coding->type)
2750     {
2751     case coding_type_no_conversion:
2752     label_no_conversion:
2753       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2754       if (produced > 0)
2755         {
2756           bcopy (source, destination, produced);
2757           if (coding->selective)
2758             {
2759               unsigned char *p = destination, *pend = destination + produced;
2760               while (p < pend)
2761                 if (*p++ == '\015') p[-1] = '\n';
2762             }
2763         }
2764       *consumed = produced;
2765       break;
2766
2767     case coding_type_emacs_mule:
2768     case coding_type_undecided:
2769       if (coding->eol_type == CODING_EOL_LF
2770           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2771         goto label_no_conversion;
2772       produced = encode_eol (coding, source, destination,
2773                              src_bytes, dst_bytes, consumed);
2774       break;
2775
2776     case coding_type_sjis:
2777       produced = encode_coding_sjis_big5 (coding, source, destination,
2778                                           src_bytes, dst_bytes, consumed,
2779                                           1);
2780       break;
2781
2782     case coding_type_iso2022:
2783       produced = encode_coding_iso2022 (coding, source, destination,
2784                                         src_bytes, dst_bytes, consumed);
2785       break;
2786
2787     case coding_type_big5:
2788       produced = encode_coding_sjis_big5 (coding, source, destination,
2789                                           src_bytes, dst_bytes, consumed,
2790                                           0);
2791       break;
2792
2793     case coding_type_ccl:
2794       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2795                              src_bytes, dst_bytes, consumed);
2796       break;
2797     }
2798
2799   return produced;
2800 }
2801
2802 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2803
2804 /* Return maximum size (bytes) of a buffer enough for decoding
2805    SRC_BYTES of text encoded in CODING.  */
2806
2807 int
2808 decoding_buffer_size (coding, src_bytes)
2809      struct coding_system *coding;
2810      int src_bytes;
2811 {
2812   int magnification;
2813
2814   if (coding->type == coding_type_iso2022)
2815     magnification = 3;
2816   else if (coding->type == coding_type_ccl)
2817     magnification = coding->spec.ccl.decoder.buf_magnification;
2818   else
2819     magnification = 2;
2820
2821   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2822 }
2823
2824 /* Return maximum size (bytes) of a buffer enough for encoding
2825    SRC_BYTES of text to CODING.  */
2826
2827 int
2828 encoding_buffer_size (coding, src_bytes)
2829      struct coding_system *coding;
2830      int src_bytes;
2831 {
2832   int magnification;
2833
2834   if (coding->type == coding_type_ccl)
2835     magnification = coding->spec.ccl.encoder.buf_magnification;
2836   else
2837     magnification = 3;
2838
2839   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2840 }
2841
2842 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2843 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2844 #endif
2845
2846 char *conversion_buffer;
2847 int conversion_buffer_size;
2848
2849 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2850    or decoding.  Sufficient memory is allocated automatically.  If we
2851    run out of memory, return NULL.  */
2852
2853 char *
2854 get_conversion_buffer (size)
2855      int size;
2856 {
2857   if (size > conversion_buffer_size)
2858     {
2859       char *buf;
2860       int real_size = conversion_buffer_size * 2;
2861
2862       while (real_size < size) real_size *= 2;
2863       buf = (char *) xmalloc (real_size);
2864       xfree (conversion_buffer);
2865       conversion_buffer = buf;
2866       conversion_buffer_size = real_size;
2867     }
2868   return conversion_buffer;
2869 }
2870
2871 \f
2872 #ifdef emacs
2873 /*** 7. Emacs Lisp library functions ***/
2874
2875 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2876        1, 1, 0,
2877   "Return coding-spec of CODING-SYSTEM.\n\
2878 If CODING-SYSTEM is not a valid coding-system, return nil.")
2879   (obj)
2880      Lisp_Object obj;
2881 {
2882   while (SYMBOLP (obj) && !NILP (obj))
2883     obj = Fget (obj, Qcoding_system);
2884   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2885           ? Qnil : obj);
2886 }
2887
2888 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2889   "Return t if OBJECT is nil or a coding-system.\n\
2890 See document of make-coding-system for coding-system object.")
2891   (obj)
2892      Lisp_Object obj;
2893 {
2894   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2895 }
2896
2897 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2898        Sread_non_nil_coding_system, 1, 1, 0,
2899   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2900   (prompt)
2901      Lisp_Object prompt;
2902 {
2903   Lisp_Object val;
2904   do
2905     {
2906       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2907                               Qt, Qnil, Qnil, Qnil);
2908     }
2909   while (XSTRING (val)->size == 0);
2910   return (Fintern (val, Qnil));
2911 }
2912
2913 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2914   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2915   (prompt)
2916      Lisp_Object prompt;
2917 {
2918   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2919                                       Qt, Qnil, Qnil, Qnil);
2920   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2921 }
2922
2923 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2924        1, 1, 0,
2925   "Check validity of CODING-SYSTEM.\n\
2926 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2927 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2928 The value of property should be a vector of length 5.")
2929   (coding_system)
2930      Lisp_Object coding_system;
2931 {
2932   CHECK_SYMBOL (coding_system, 0);
2933   if (!NILP (Fcoding_system_p (coding_system)))
2934     return coding_system;
2935   while (1)
2936     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2937 }
2938
2939 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2940        2, 2, 0,
2941   "Detect coding-system of the text in the region between START and END.\n\
2942 Return a list of possible coding-systems ordered by priority.\n\
2943 If only ASCII characters are found, it returns `undecided'\n\
2944  or its subsidiary coding-system according to a detected end-of-line format.")
2945   (b, e)
2946      Lisp_Object b, e;
2947 {
2948   int coding_mask, eol_type;
2949   Lisp_Object val;
2950   int beg, end;
2951
2952   validate_region (&b, &e);
2953   beg = XINT (b), end = XINT (e);
2954   if (beg < GPT && end >= GPT) move_gap (end);
2955
2956   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2957   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2958
2959   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2960     {
2961       val = intern ("undecided");
2962       if (eol_type != CODING_EOL_UNDECIDED)
2963         {
2964           Lisp_Object val2 = Fget (val, Qeol_type);
2965           if (VECTORP (val2))
2966             val = XVECTOR (val2)->contents[eol_type];
2967         }
2968     }
2969   else
2970     {
2971       Lisp_Object val2;
2972
2973       /* At first, gather possible coding-systems in VAL in a reverse
2974          order.  */
2975       val = Qnil;
2976       for (val2 = Vcoding_category_list;
2977            !NILP (val2);
2978            val2 = XCONS (val2)->cdr)
2979         {
2980           int idx
2981             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2982           if (coding_mask & (1 << idx))
2983             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2984         }
2985
2986       /* Then, change the order of the list, while getting subsidiary
2987          coding-systems.  */
2988       val2 = val;
2989       val = Qnil;
2990       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2991         {
2992           if (eol_type == CODING_EOL_UNDECIDED)
2993             val = Fcons (XCONS (val2)->car, val);
2994           else
2995             {
2996               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2997               if (VECTORP (val3))
2998                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2999               else
3000                 val = Fcons (XCONS (val2)->car, val);
3001             }
3002         }
3003     }
3004
3005   return val;
3006 }
3007
3008 /* Scan text in the region between *BEGP and *ENDP, skip characters
3009    which we never have to encode to (iff ENCODEP is 1) or decode from
3010    coding system CODING at the head and tail, then set BEGP and ENDP
3011    to the addresses of start and end of the text we actually convert.  */
3012
3013 void
3014 shrink_conversion_area (begp, endp, coding, encodep)
3015      unsigned char **begp, **endp;
3016      struct coding_system *coding;
3017      int encodep;
3018 {
3019   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3020
3021   if (coding->eol_type != CODING_EOL_LF
3022       && coding->eol_type != CODING_EOL_UNDECIDED)
3023     /* Since we anyway have to convert end-of-line format, it is not
3024        worth skipping at most 100 bytes or so.  */
3025     return;
3026
3027   if (encodep)                  /* for encoding */
3028     {
3029       switch (coding->type)
3030         {
3031         case coding_type_no_conversion:
3032         case coding_type_emacs_mule:
3033         case coding_type_undecided:
3034           /* We need no conversion.  */
3035           *begp = *endp;
3036           return;
3037         case coding_type_ccl:
3038           /* We can't skip any data.  */
3039           return;
3040         case coding_type_iso2022:
3041           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3042             {
3043               unsigned char *bol = beg_addr;
3044               while (beg_addr < end_addr && *beg_addr < 0x80)
3045                 {
3046                   beg_addr++;
3047                   if (*(beg_addr - 1) == '\n')
3048                     bol = beg_addr;
3049                 }
3050               beg_addr = bol;
3051               goto label_skip_tail;
3052             }
3053           /* fall down ... */
3054         default:
3055           /* We can skip all ASCII characters at the head and tail.  */
3056           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3057         label_skip_tail:
3058           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3059           break;
3060         }
3061     }
3062   else                          /* for decoding */
3063     {
3064       switch (coding->type)
3065         {
3066         case coding_type_no_conversion:
3067           /* We need no conversion.  */
3068           *begp = *endp;
3069           return;
3070         case coding_type_emacs_mule:
3071           if (coding->eol_type == CODING_EOL_LF)
3072             {
3073               /* We need no conversion.  */
3074               *begp = *endp;
3075               return;
3076             }
3077           /* We can skip all but carriage-return.  */
3078           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3079           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3080           break;
3081         case coding_type_sjis:
3082         case coding_type_big5:
3083           /* We can skip all ASCII characters at the head.  */
3084           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3085           /* We can skip all ASCII characters at the tail except for
3086              the second byte of SJIS or BIG5 code.  */
3087           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3088           if (end_addr != *endp)
3089             end_addr++;
3090           break;
3091         case coding_type_ccl:
3092           /* We can't skip any data.  */
3093           return;
3094         default:                /* i.e. case coding_type_iso2022: */
3095           {
3096             unsigned char c;
3097
3098             /* We can skip all ASCII characters except for a few
3099                control codes at the head.  */
3100             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3101                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3102                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3103               beg_addr++;
3104           }
3105           break;
3106         }
3107     }
3108   *begp = beg_addr;
3109   *endp = end_addr;
3110   return;
3111 }
3112
3113 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3114    text between B and E.  B and E are buffer position.  */
3115
3116 Lisp_Object
3117 code_convert_region (b, e, coding, encodep)
3118      Lisp_Object b, e;
3119      struct coding_system *coding;
3120      int encodep;
3121 {
3122   int beg, end, len, consumed, produced;
3123   char *buf;
3124   unsigned char *begp, *endp;
3125   int pos = PT;
3126
3127   validate_region (&b, &e);
3128   beg = XINT (b), end = XINT (e);
3129   if (beg < GPT && end >= GPT)
3130     move_gap (end);
3131
3132   if (encodep && !NILP (coding->pre_write_conversion))
3133     {
3134       /* We must call a pre-conversion function which may put a new
3135          text to be converted in a new buffer.  */
3136       struct buffer *old = current_buffer, *new;
3137
3138       TEMP_SET_PT (beg);
3139       call2 (coding->pre_write_conversion, b, e);
3140       if (old != current_buffer)
3141         {
3142           /* Replace the original text by the text just generated.  */
3143           len = ZV - BEGV;
3144           new = current_buffer;
3145           set_buffer_internal (old);
3146           del_range (beg, end);
3147           insert_from_buffer (new, 1, len, 0);
3148           end = beg + len;
3149         }
3150     }
3151
3152   /* We may be able to shrink the conversion region.  */
3153   begp = POS_ADDR (beg); endp = begp + (end - beg);
3154   shrink_conversion_area (&begp, &endp, coding, encodep);
3155
3156   if (begp == endp)
3157     /* We need no conversion.  */
3158     len = end - beg;
3159   else
3160     {
3161       beg += begp - POS_ADDR (beg);
3162       end =  beg + (endp - begp);
3163
3164       if (encodep)
3165         len = encoding_buffer_size (coding, end - beg);
3166       else
3167         len = decoding_buffer_size (coding, end - beg);
3168       buf = get_conversion_buffer (len);
3169
3170       coding->last_block = 1;
3171       produced = (encodep
3172                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3173                                    &consumed)
3174                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3175                                    &consumed));
3176
3177       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3178
3179       TEMP_SET_PT (beg);
3180       insert (buf, produced);
3181       del_range (PT, PT + end - beg);
3182       if (pos >= end)
3183         pos = PT + (pos - end);
3184       else if (pos > beg)
3185         pos = beg;
3186       TEMP_SET_PT (pos);
3187   }
3188
3189   if (!encodep && !NILP (coding->post_read_conversion))
3190     {
3191       /* We must call a post-conversion function which may alter
3192          the text just converted.  */
3193       Lisp_Object insval;
3194
3195       beg = XINT (b);
3196       TEMP_SET_PT (beg);
3197       insval = call1 (coding->post_read_conversion, make_number (len));
3198       CHECK_NUMBER (insval, 0);
3199       len = XINT (insval);
3200     }
3201
3202   return make_number (len);
3203 }
3204
3205 Lisp_Object
3206 code_convert_string (str, coding, encodep, nocopy)
3207      Lisp_Object str, nocopy;
3208      struct coding_system *coding;
3209      int encodep;
3210 {
3211   int len, consumed, produced;
3212   char *buf;
3213   unsigned char *begp, *endp;
3214   int head_skip, tail_skip;
3215   struct gcpro gcpro1;
3216
3217   if (encodep && !NILP (coding->pre_write_conversion)
3218       || !encodep && !NILP (coding->post_read_conversion))
3219     {
3220       /* Since we have to call Lisp functions which assume target text
3221          is in a buffer, after setting a temporary buffer, call
3222          code_convert_region.  */
3223       int count = specpdl_ptr - specpdl;
3224       int len = XSTRING (str)->size;
3225       Lisp_Object result;
3226       struct buffer *old = current_buffer;
3227
3228       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3229       temp_output_buffer_setup (" *code-converting-work*");
3230       set_buffer_internal (XBUFFER (Vstandard_output));
3231       insert_from_string (str, 0, len, 0);
3232       code_convert_region (make_number (BEGV), make_number (ZV),
3233                            coding, encodep);
3234       result = make_buffer_string (BEGV, ZV, 0);
3235       set_buffer_internal (old);
3236       return unbind_to (count, result);
3237     }
3238
3239   /* We may be able to shrink the conversion region.  */
3240   begp = XSTRING (str)->data;
3241   endp = begp + XSTRING (str)->size;
3242   shrink_conversion_area (&begp, &endp, coding, encodep);
3243
3244   if (begp == endp)
3245     /* We need no conversion.  */
3246     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3247
3248   head_skip = begp - XSTRING (str)->data;
3249   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3250
3251   GCPRO1 (str);
3252
3253   if (encodep)
3254     len = encoding_buffer_size (coding, endp - begp);
3255   else
3256     len = decoding_buffer_size (coding, endp - begp);
3257   buf = get_conversion_buffer (len + head_skip + tail_skip);
3258
3259   bcopy (XSTRING (str)->data, buf, head_skip);
3260   coding->last_block = 1;
3261   produced = (encodep
3262               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3263                                buf + head_skip, endp - begp, len, &consumed)
3264               : decode_coding (coding, XSTRING (str)->data + head_skip,
3265                                buf + head_skip, endp - begp, len, &consumed));
3266   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3267          buf + head_skip + produced,
3268          tail_skip);
3269
3270   UNGCPRO;
3271
3272   return make_string (buf, head_skip + produced + tail_skip);
3273 }
3274
3275 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3276        3, 3, "r\nzCoding system: ",
3277   "Decode current region by specified coding system.\n\
3278 When called from a program, takes three arguments:\n\
3279 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3280 Return length of decoded text.")
3281   (b, e, coding_system)
3282      Lisp_Object b, e, coding_system;
3283 {
3284   struct coding_system coding;
3285
3286   CHECK_NUMBER_COERCE_MARKER (b, 0);
3287   CHECK_NUMBER_COERCE_MARKER (e, 1);
3288   CHECK_SYMBOL (coding_system, 2);
3289
3290   if (NILP (coding_system))
3291     return make_number (XFASTINT (e) - XFASTINT (b));
3292   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3293     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3294
3295   return code_convert_region (b, e, &coding, 0);
3296 }
3297
3298 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3299        3, 3, "r\nzCoding system: ",
3300   "Encode current region by specified coding system.\n\
3301 When called from a program, takes three arguments:\n\
3302 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3303 Return length of encoded text.")
3304   (b, e, coding_system)
3305      Lisp_Object b, e, coding_system;
3306 {
3307   struct coding_system coding;
3308
3309   CHECK_NUMBER_COERCE_MARKER (b, 0);
3310   CHECK_NUMBER_COERCE_MARKER (e, 1);
3311   CHECK_SYMBOL (coding_system, 2);
3312
3313   if (NILP (coding_system))
3314     return make_number (XFASTINT (e) - XFASTINT (b));
3315   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3316     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3317
3318   return code_convert_region (b, e, &coding, 1);
3319 }
3320
3321 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3322        2, 3, 0,
3323   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3324 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3325 of decoding.")
3326   (string, coding_system, nocopy)
3327      Lisp_Object string, coding_system, nocopy;
3328 {
3329   struct coding_system coding;
3330
3331   CHECK_STRING (string, 0);
3332   CHECK_SYMBOL (coding_system, 1);
3333
3334   if (NILP (coding_system))
3335     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3336   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3337     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3338
3339   return code_convert_string (string, &coding, 0, nocopy);
3340 }
3341
3342 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3343        2, 3, 0,
3344   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3345 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3346 of encoding.")
3347   (string, coding_system, nocopy)
3348      Lisp_Object string, coding_system, nocopy;
3349 {
3350   struct coding_system coding;
3351
3352   CHECK_STRING (string, 0);
3353   CHECK_SYMBOL (coding_system, 1);
3354
3355   if (NILP (coding_system))
3356     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3357   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3358     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3359
3360   return code_convert_string (string, &coding, 1, nocopy);
3361 }
3362
3363 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3364   "Decode a JISX0208 character of shift-jis encoding.\n\
3365 CODE is the character code in SJIS.\n\
3366 Return the corresponding character.")
3367   (code)
3368      Lisp_Object code;
3369 {
3370   unsigned char c1, c2, s1, s2;
3371   Lisp_Object val;
3372
3373   CHECK_NUMBER (code, 0);
3374   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3375   DECODE_SJIS (s1, s2, c1, c2);
3376   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3377   return val;
3378 }
3379
3380 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3381   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3382 Return the corresponding character code in SJIS.")
3383   (ch)
3384      Lisp_Object ch;
3385 {
3386   int charset, c1, c2, s1, s2;
3387   Lisp_Object val;
3388
3389   CHECK_NUMBER (ch, 0);
3390   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3391   if (charset == charset_jisx0208)
3392     {
3393       ENCODE_SJIS (c1, c2, s1, s2);
3394       XSETFASTINT (val, (s1 << 8) | s2);
3395     }
3396   else
3397     XSETFASTINT (val, 0);
3398   return val;
3399 }
3400
3401 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3402   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3403 CODE is the character code in BIG5.\n\
3404 Return the corresponding character.")
3405   (code)
3406      Lisp_Object code;
3407 {
3408   int charset;
3409   unsigned char b1, b2, c1, c2;
3410   Lisp_Object val;
3411
3412   CHECK_NUMBER (code, 0);
3413   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3414   DECODE_BIG5 (b1, b2, charset, c1, c2);
3415   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3416   return val;
3417 }
3418
3419 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3420   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3421 Return the corresponding character code in Big5.")
3422   (ch)
3423      Lisp_Object ch;
3424 {
3425   int charset, c1, c2, b1, b2;
3426   Lisp_Object val;
3427
3428   CHECK_NUMBER (ch, 0);
3429   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3430   if (charset == charset_big5_1 || charset == charset_big5_2)
3431     {
3432       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3433       XSETFASTINT (val, (b1 << 8) | b2);
3434     }
3435   else
3436     XSETFASTINT (val, 0);
3437   return val;
3438 }
3439
3440 DEFUN ("set-terminal-coding-system-internal",
3441        Fset_terminal_coding_system_internal,
3442        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3443   (coding_system)
3444      Lisp_Object coding_system;
3445 {
3446   CHECK_SYMBOL (coding_system, 0);
3447   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3448   return Qnil;
3449 }
3450
3451 DEFUN ("terminal-coding-system",
3452        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3453   "Return coding-system of your terminal.")
3454   ()
3455 {
3456   return terminal_coding.symbol;
3457 }
3458
3459 DEFUN ("set-keyboard-coding-system-internal",
3460        Fset_keyboard_coding_system_internal,
3461        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3462   (coding_system)
3463      Lisp_Object coding_system;
3464 {
3465   CHECK_SYMBOL (coding_system, 0);
3466   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3467   return Qnil;
3468 }
3469
3470 DEFUN ("keyboard-coding-system",
3471        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3472   "Return coding-system of what is sent from terminal keyboard.")
3473   ()
3474 {
3475   return keyboard_coding.symbol;
3476 }
3477
3478 \f
3479 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3480        Sfind_operation_coding_system,  1, MANY, 0,
3481   "Choose a coding system for an operation based on the target name.\n\
3482 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3483 DECODING-SYSTEM is the coding system to use for decoding\n\
3484 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3485 for encoding (in case OPERATION does encoding).\n\
3486 \n\
3487 The first argument OPERATION specifies an I/O primitive:\n\
3488   For file I/O, `insert-file-contents' or `write-region'.\n\
3489   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3490   For network I/O, `open-network-stream'.\n\
3491 \n\
3492 The remaining arguments should be the same arguments that were passed\n\
3493 to the primitive.  Depending on which primitive, one of those arguments\n\
3494 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3495 whichever argument specifies the file name is TARGET.\n\
3496 \n\
3497 TARGET has a meaning which depends on OPERATION:\n\
3498   For file I/O, TARGET is a file name.\n\
3499   For process I/O, TARGET is a process name.\n\
3500   For network I/O, TARGET is a service name or a port number\n\
3501 \n\
3502 This function looks up what specified for TARGET in,\n\
3503 `file-coding-system-alist', `process-coding-system-alist',\n\
3504 or `network-coding-system-alist' depending on OPERATION.\n\
3505 They may specify a coding system, a cons of coding systems,\n\
3506 or a function symbol to call.\n\
3507 In the last case, we call the function with one argument,\n\
3508 which is a list of all the arguments given to this function.")
3509   (nargs, args)
3510      int nargs;
3511      Lisp_Object *args;
3512 {
3513   Lisp_Object operation, target_idx, target, val;
3514   register Lisp_Object chain;
3515
3516   if (nargs < 2)
3517     error ("Too few arguments");
3518   operation = args[0];
3519   if (!SYMBOLP (operation)
3520       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3521     error ("Invalid first arguement");
3522   if (nargs < 1 + XINT (target_idx))
3523     error ("Too few arguments for operation: %s",
3524            XSYMBOL (operation)->name->data);
3525   target = args[XINT (target_idx) + 1];
3526   if (!(STRINGP (target)
3527         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3528     error ("Invalid %dth argument", XINT (target_idx) + 1);
3529
3530   chain = ((EQ (operation, Qinsert_file_contents)
3531             || EQ (operation, Qwrite_region))
3532            ? Vfile_coding_system_alist
3533            : (EQ (operation, Qopen_network_stream)
3534               ? Vnetwork_coding_system_alist
3535               : Vprocess_coding_system_alist));
3536   if (NILP (chain))
3537     return Qnil;
3538
3539   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3540     {
3541       Lisp_Object elt = XCONS (chain)->car;
3542
3543       if (CONSP (elt)
3544           && ((STRINGP (target)
3545                && STRINGP (XCONS (elt)->car)
3546                && fast_string_match (XCONS (elt)->car, target) >= 0)
3547               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3548         {
3549           val = XCONS (elt)->cdr;
3550           if (CONSP (val))
3551             return val;
3552           if (! SYMBOLP (val))
3553             return Qnil;
3554           if (! NILP (Fcoding_system_p (val)))
3555             return Fcons (val, val);
3556           if (!NILP (Fboundp (val)))
3557             return call2 (val, Flist (nargs, args));
3558           return Qnil;
3559         }
3560     }
3561   return Qnil;
3562 }
3563
3564 #endif /* emacs */
3565
3566 \f
3567 /*** 8. Post-amble ***/
3568
3569 init_coding_once ()
3570 {
3571   int i;
3572
3573   /* Emacs' internal format specific initialize routine.  */
3574   for (i = 0; i <= 0x20; i++)
3575     emacs_code_class[i] = EMACS_control_code;
3576   emacs_code_class[0x0A] = EMACS_linefeed_code;
3577   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3578   for (i = 0x21 ; i < 0x7F; i++)
3579     emacs_code_class[i] = EMACS_ascii_code;
3580   emacs_code_class[0x7F] = EMACS_control_code;
3581   emacs_code_class[0x80] = EMACS_leading_code_composition;
3582   for (i = 0x81; i < 0xFF; i++)
3583     emacs_code_class[i] = EMACS_invalid_code;
3584   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3585   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3586   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3587   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3588
3589   /* ISO2022 specific initialize routine.  */
3590   for (i = 0; i < 0x20; i++)
3591     iso_code_class[i] = ISO_control_code;
3592   for (i = 0x21; i < 0x7F; i++)
3593     iso_code_class[i] = ISO_graphic_plane_0;
3594   for (i = 0x80; i < 0xA0; i++)
3595     iso_code_class[i] = ISO_control_code;
3596   for (i = 0xA1; i < 0xFF; i++)
3597     iso_code_class[i] = ISO_graphic_plane_1;
3598   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3599   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3600   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3601   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3602   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3603   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3604   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3605   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3606   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3607   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3608
3609   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3610   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3611
3612   setup_coding_system (Qnil, &keyboard_coding);
3613   setup_coding_system (Qnil, &terminal_coding);
3614
3615 #if defined (MSDOS) || defined (WINDOWSNT)
3616   system_eol_type = CODING_EOL_CRLF;
3617 #else
3618   system_eol_type = CODING_EOL_LF;
3619 #endif
3620 }
3621
3622 #ifdef emacs
3623
3624 syms_of_coding ()
3625 {
3626   Qtarget_idx = intern ("target-idx");
3627   staticpro (&Qtarget_idx);
3628
3629   /* Target FILENAME is the first argument.  */
3630   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3631   /* Target FILENAME is the third argument.  */
3632   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3633
3634   Qcall_process = intern ("call-process");
3635   staticpro (&Qcall_process);
3636   /* Target PROGRAM is the first argument.  */
3637   Fput (Qcall_process, Qtarget_idx, make_number (0));
3638
3639   Qcall_process_region = intern ("call-process-region");
3640   staticpro (&Qcall_process_region);
3641   /* Target PROGRAM is the third argument.  */
3642   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3643
3644   Qstart_process = intern ("start-process");
3645   staticpro (&Qstart_process);
3646   /* Target PROGRAM is the third argument.  */
3647   Fput (Qstart_process, Qtarget_idx, make_number (2));
3648
3649   Qopen_network_stream = intern ("open-network-stream");
3650   staticpro (&Qopen_network_stream);
3651   /* Target SERVICE is the fourth argument.  */
3652   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3653
3654   Qcoding_system = intern ("coding-system");
3655   staticpro (&Qcoding_system);
3656
3657   Qeol_type = intern ("eol-type");
3658   staticpro (&Qeol_type);
3659
3660   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3661   staticpro (&Qbuffer_file_coding_system);
3662
3663   Qpost_read_conversion = intern ("post-read-conversion");
3664   staticpro (&Qpost_read_conversion);
3665
3666   Qpre_write_conversion = intern ("pre-write-conversion");
3667   staticpro (&Qpre_write_conversion);
3668
3669   Qcoding_system_spec = intern ("coding-system-spec");
3670   staticpro (&Qcoding_system_spec);
3671
3672   Qcoding_system_p = intern ("coding-system-p");
3673   staticpro (&Qcoding_system_p);
3674
3675   Qcoding_system_error = intern ("coding-system-error");
3676   staticpro (&Qcoding_system_error);
3677
3678   Fput (Qcoding_system_error, Qerror_conditions,
3679         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3680   Fput (Qcoding_system_error, Qerror_message,
3681         build_string ("Invalid coding system"));
3682
3683   Qcoding_category_index = intern ("coding-category-index");
3684   staticpro (&Qcoding_category_index);
3685
3686   {
3687     int i;
3688     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3689       {
3690         coding_category_table[i] = intern (coding_category_name[i]);
3691         staticpro (&coding_category_table[i]);
3692         Fput (coding_category_table[i], Qcoding_category_index,
3693               make_number (i));
3694       }
3695   }
3696
3697   Qcharacter_unification_table = intern ("character-unification-table");
3698   staticpro (&Qcharacter_unification_table);
3699   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3700         make_number (0));
3701
3702   Qcharacter_unification_table_for_decode
3703     = intern ("character-unification-table-for-decode");
3704   staticpro (&Qcharacter_unification_table_for_decode);
3705
3706   Qcharacter_unification_table_for_encode
3707     = intern ("character-unification-table-for-encode");
3708   staticpro (&Qcharacter_unification_table_for_encode);
3709
3710   Qemacs_mule = intern ("emacs-mule");
3711   staticpro (&Qemacs_mule);
3712
3713   defsubr (&Scoding_system_spec);
3714   defsubr (&Scoding_system_p);
3715   defsubr (&Sread_coding_system);
3716   defsubr (&Sread_non_nil_coding_system);
3717   defsubr (&Scheck_coding_system);
3718   defsubr (&Sdetect_coding_region);
3719   defsubr (&Sdecode_coding_region);
3720   defsubr (&Sencode_coding_region);
3721   defsubr (&Sdecode_coding_string);
3722   defsubr (&Sencode_coding_string);
3723   defsubr (&Sdecode_sjis_char);
3724   defsubr (&Sencode_sjis_char);
3725   defsubr (&Sdecode_big5_char);
3726   defsubr (&Sencode_big5_char);
3727   defsubr (&Sset_terminal_coding_system_internal);
3728   defsubr (&Sterminal_coding_system);
3729   defsubr (&Sset_keyboard_coding_system_internal);
3730   defsubr (&Skeyboard_coding_system);
3731   defsubr (&Sfind_operation_coding_system);
3732
3733   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3734     "List of coding-categories (symbols) ordered by priority.");
3735   {
3736     int i;
3737
3738     Vcoding_category_list = Qnil;
3739     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3740       Vcoding_category_list
3741         = Fcons (coding_category_table[i], Vcoding_category_list);
3742   }
3743
3744   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3745     "A variable of internal use only.\n\
3746 If the value is a coding system, it is used for decoding on read operation.\n\
3747 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3748   Vcoding_system_for_read = Qnil;
3749
3750   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3751     "A variable of internal use only.\n\
3752 If the value is a coding system, it is used for encoding on write operation.\n\
3753 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3754   Vcoding_system_for_write = Qnil;
3755
3756   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3757     "Coding-system used in the latest file or process I/O.");
3758   Vlast_coding_system_used = Qnil;
3759
3760   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3761     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3762   inhibit_eol_conversion = 0;
3763
3764   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3765     "Alist to decide a coding system to use for a file I/O operation.\n\
3766 The format is ((PATTERN . VAL) ...),\n\
3767 where PATTERN is a regular expression matching a file name,\n\
3768 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3769 If VAL is a coding system, it is used for both decoding and encoding\n\
3770 the file contents.\n\
3771 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3772 and the cdr part is used for encoding.\n\
3773 If VAL is a function symbol, the function must return a coding system\n\
3774 or a cons of coding systems which are used as above.\n\
3775 \n\
3776 See also the function `find-operation-coding-system'.");
3777   Vfile_coding_system_alist = Qnil;
3778
3779   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3780     "Alist to decide a coding system to use for a process I/O operation.\n\
3781 The format is ((PATTERN . VAL) ...),\n\
3782 where PATTERN is a regular expression matching a program name,\n\
3783 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3784 If VAL is a coding system, it is used for both decoding what received\n\
3785 from the program and encoding what sent to the program.\n\
3786 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3787 and the cdr part is used for encoding.\n\
3788 If VAL is a function symbol, the function must return a coding system\n\
3789 or a cons of coding systems which are used as above.\n\
3790 \n\
3791 See also the function `find-operation-coding-system'.");
3792   Vprocess_coding_system_alist = Qnil;
3793
3794   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3795     "Alist to decide a coding system to use for a network I/O operation.\n\
3796 The format is ((PATTERN . VAL) ...),\n\
3797 where PATTERN is a regular expression matching a network service name\n\
3798 or is a port number to connect to,\n\
3799 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3800 If VAL is a coding system, it is used for both decoding what received\n\
3801 from the network stream and encoding what sent to the network stream.\n\
3802 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3803 and the cdr part is used for encoding.\n\
3804 If VAL is a function symbol, the function must return a coding system\n\
3805 or a cons of coding systems which are used as above.\n\
3806 \n\
3807 See also the function `find-operation-coding-system'.");
3808   Vnetwork_coding_system_alist = Qnil;
3809
3810   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3811     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3812   eol_mnemonic_unix = ':';
3813
3814   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3815     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3816   eol_mnemonic_dos = '\\';
3817
3818   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3819     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3820   eol_mnemonic_mac = '/';
3821
3822   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3823     "Mnemonic character indicating end-of-line format is not yet decided.");
3824   eol_mnemonic_undecided = ':';
3825
3826   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3827     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3828   Venable_character_unification = Qt;
3829
3830   DEFVAR_LISP ("standard-character-unification-table-for-decode",
3831     &Vstandard_character_unification_table_for_decode,
3832     "Table for unifying characters when reading.");
3833   Vstandard_character_unification_table_for_decode = Qnil;
3834
3835   DEFVAR_LISP ("standard-character-unification-table-for-encode",
3836     &Vstandard_character_unification_table_for_encode,
3837     "Table for unifying characters when writing.");
3838   Vstandard_character_unification_table_for_encode = Qnil;
3839
3840   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3841     "Alist of charsets vs revision numbers.\n\
3842 While encoding, if a charset (car part of an element) is found,\n\
3843 designate it with the escape sequence identifing revision (cdr part of the element).");
3844   Vcharset_revision_alist = Qnil;
3845
3846   DEFVAR_LISP ("default-process-coding-system",
3847                &Vdefault_process_coding_system,
3848     "Cons of coding systems used for process I/O by default.\n\
3849 The car part is used for decoding a process output,\n\
3850 the cdr part is used for encoding a text to be sent to a process.");
3851   Vdefault_process_coding_system = Qnil;
3852 }
3853
3854 #endif /* emacs */