src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   A coding system is an encoding mechanism for one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in the section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and such coding
  53   systems used in Internet communication as ISO-2022-JP are all
  54   variants of ISO2022.  Details are described in the section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   the section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in the section 4.  In this file, when written as "BIG5"
  67   (all uppercase), it means the coding system, and when written as
  68   "Big5" (capitalized), it means the character set.
  69
  70   4. Else
  71
  72   If a user wants to read/write text encoded in a coding system not
  73   listed above, they can supply a decoder and an encoder for it as CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL programs
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See section 6 for more
  81   detail.
  82
  83 */
  84
  85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  86
  87   How end-of-line of a text is encoded depends on a system.  For
  88   instance, Unix's format is just one byte of `line-feed' code,
  89   whereas DOS's format is two bytes sequence of `carriage-return' and
  90   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  91
  92   Since how the characters in a text are encoded and how end-of-line
  93   is encoded are independent, any coding system described above can
  94   take any end-of-line format.  So, Emacs keeps the end-of-line format
  95   as part of the information of each coding-system.  See section 6 for
  96   more detail.
  97
  98 */
  99
 100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 101
 102   These functions check if a text between SRC and SRC_END is encoded
 103   in the coding system category XXX.  Each returns an integer value in
 104   which appropriate flag bits for the category XXX are set.  The flag
 105   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 106   template of these functions.  */
 107 #if 0
 108 int
 109 detect_coding_emacs_mule (src, src_end)
 110      unsigned char *src, *src_end;
 111 {
 112   ...
 113 }
 114 #endif
 115
 116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 117
 118   These functions decode SRC_BYTES length text at SOURCE encoded in
 119   CODING to Emacs' internal format (emacs-mule).  The resulting text
 120   goes to a place pointed to by DESTINATION, the length of which should
 121   not exceed DST_BYTES.  The number of bytes actually processed is
 122   returned as *CONSUMED.  The return value is the length of the
 123   decoded text.  Below is a template of these functions.  */
 124 #if 0
 125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 126      struct coding_system *coding;
 127      unsigned char *source, *destination;
 128      int src_bytes, dst_bytes;
 129      int *consumed;
 130 {
 131   ...
 132 }
 133 #endif
 134
 135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 136
 137   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 138   internal format (emacs-mule) to CODING.  The resulting text goes to
 139   a place pointed to by DESTINATION, the length of which should not
 140   exceed DST_BYTES.  The number of bytes actually processed is
 141   returned as *CONSUMED.  The return value is the length of the
 142   encoded text.  Below is a template of these functions.  */
 143 #if 0
 144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148      int *consumed;
 149 {
 150   ...
 151 }
 152 #endif
 153
 154 /*** COMMONLY USED MACROS ***/
 155
 156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 157    THREE_MORE_BYTES safely get one, two, and three bytes from `src'
 158    respectively, advancing it.  If not enough bytes remain before
 159    `src_end', they leave `src' alone and jump to `label_end_of_loop'.
 160    The caller must set `src' and `src_end', and define that label.  */
 161 /* ONE_MORE_BYTE stores the fetched byte in C1.  */
 162 #define ONE_MORE_BYTE(c1)       \
 163   do {                          \
 164     if (src < src_end)          \
 165       c1 = *src++;              \
 166     else                        \
 167       goto label_end_of_loop;   \
 168   } while (0)
 169 /* TWO_MORE_BYTES stores the fetched bytes in C1 and C2, in that order.  */
 170 #define TWO_MORE_BYTES(c1, c2)  \
 171   do {                          \
 172     if (src + 1 < src_end)      \
 173       c1 = *src++, c2 = *src++; \
 174     else                        \
 175       goto label_end_of_loop;   \
 176   } while (0)
 177 /* THREE_MORE_BYTES stores the fetched bytes in C1, C2, and C3, in order.  */
 178 #define THREE_MORE_BYTES(c1, c2, c3)            \
 179   do {                                          \
 180     if (src + 2 < src_end)                      \
 181       c1 = *src++, c2 = *src++, c3 = *src++;    \
 182     else                                        \
 183       goto label_end_of_loop;                   \
 184   } while (0)
 185
 186 /* The following three macros DECODE_CHARACTER_ASCII,
 187    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 188    the multi-byte form of a character of each class at the place
 189    pointed by `dst'.  The caller should set the variable `dst' to
 190    point to an appropriate area and the variable `coding' to point to
 191    the coding-system of the currently decoding text in advance.  */
 192
 193 /* Decode one ASCII character C: store it at `dst' and advance `dst'.  */
 194 /* When COMPOSING_P holds, C is stored as the two bytes 0xA0, C | 0x80.  */
 195 #define DECODE_CHARACTER_ASCII(c)                               \
 196   do {                                                          \
 197     if (COMPOSING_P (coding->composing))                        \
 198       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 199     else                                                        \
 200       *dst++ = (c);                                             \
 201   } while (0)
 202
 203 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 204    position-code is C, storing its bytes at `dst' (which is advanced).  */
 205 /* Note: the second `if' assigns; a nonzero extended leading-code is emitted.  */
 206 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 207   do {                                                                  \
 208     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 209     if (COMPOSING_P (coding->composing))                                \
 210       *dst++ = leading_code + 0x20;                                     \
 211     else                                                                \
 212       *dst++ = leading_code;                                            \
 213     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 214       *dst++ = leading_code;                                            \
 215     *dst++ = (c) | 0x80;                                                \
 216   } while (0)
 217
 218 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 219    position-codes are C1 and C2.  */
 220 /* Same output as DECODE_CHARACTER_DIMENSION1 for C1, followed by C2 | 0x80.  */
 221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 222   do {                                                  \
 223     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 224     *dst++ = (c2) | 0x80;                               \
 225   } while (0)
 226
 227 \f
 228 /*** 1. Preamble ***/
 229
 230 #include <stdio.h>
 231
 232 #ifdef emacs
 233
 234 #include <config.h>
 235 #include "lisp.h"
 236 #include "buffer.h"
 237 #include "charset.h"
 238 #include "ccl.h"
 239 #include "coding.h"
 240 #include "window.h"
 241
 242 #else  /* not emacs */
 243
 244 #include "mulelib.h"
 245
 246 #endif /* not emacs */
 247
 248 Lisp_Object Qcoding_system, Qeol_type;
 249 Lisp_Object Qbuffer_file_coding_system;
 250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 251 /* NOTE(review): the Q... objects here are presumably Lisp symbols -- confirm.  */
 252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 254 Lisp_Object Qstart_process, Qopen_network_stream;
 255 Lisp_Object Qtarget_idx;
 256
 257 /* Mnemonic character for each end-of-line format (Unix, DOS, Mac).  */
 258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 259 /* Mnemonic character to indicate that the end-of-line format is not
 260    yet decided.  */
 261 int eol_mnemonic_undecided;
 262
 263 #ifdef emacs
 264
 265 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 266
 267 /* Coding-systems are passed between Emacs Lisp programs and C internal
 268    routines through the following three variables.  */
 269 /* Coding-system for reading files and receiving data from a process.  */
 270 Lisp_Object Vcoding_system_for_read;
 271 /* Coding-system for writing files and sending data to a process.  */
 272 Lisp_Object Vcoding_system_for_write;
 273 /* Coding-system actually used in the latest I/O.  */
 274 Lisp_Object Vlast_coding_system_used;
 275
 276 /* Coding-system that the terminal accepts for display.  */
 277 struct coding_system terminal_coding;
 278
 279 /* Coding-system of what is sent from the terminal keyboard.  */
 280 struct coding_system keyboard_coding;
 281 /* NOTE(review): presumably alists choosing coding-systems per target -- confirm.  */
 282 Lisp_Object Vfile_coding_system_alist;
 283 Lisp_Object Vprocess_coding_system_alist;
 284 Lisp_Object Vnetwork_coding_system_alist;
 285
 286 #endif /* emacs */
 287
 288 Lisp_Object Qcoding_category_index;
 289
 290 /* List of symbols `coding-category-xxx' ordered by priority.  */
 291 Lisp_Object Vcoding_category_list;
 292
 293 /* Table of coding-systems currently assigned to each coding-category.  */
 294 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 295 /* Table of the symbol name of each coding-category.  */
 296 /* NOTE(review): order presumably follows CODING_CATEGORY_IDX_* -- confirm.  */
 297 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 298   "coding-category-emacs-mule",
 299   "coding-category-sjis",
 300   "coding-category-iso-7",
 301   "coding-category-iso-8-1",
 302   "coding-category-iso-8-2",
 303   "coding-category-iso-else",
 304   "coding-category-big5",
 305   "coding-category-binary"
 306 };
 307
 308 /* Flag telling whether to look up a unification table on character
 309    code conversion.  */
 310 Lisp_Object Venable_character_unification;
 311 /* Standard unification table to look up on decoding (reading).  */
 312 Lisp_Object Vstandard_character_unification_table_for_decode;
 313 /* Standard unification table to look up on encoding (writing).  */
 314 Lisp_Object Vstandard_character_unification_table_for_encode;
 315
 316 Lisp_Object Qcharacter_unification_table;
 317 Lisp_Object Qcharacter_unification_table_for_decode;
 318 Lisp_Object Qcharacter_unification_table_for_encode;
 319
 320 /* Alist mapping charsets to revision numbers.  */
 321 Lisp_Object Vcharset_revision_alist;
 322
 323 /* Default coding systems used for process I/O.  */
 324 Lisp_Object Vdefault_process_coding_system;
 325
 326 \f
 327 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 328
 329 /* Emacs' internal format for encoding multiple character sets is a
 330    kind of multi-byte encoding, i.e. encoding a character by a sequence
 331    of one-byte codes of variable length.  ASCII characters and control
 332    characters (e.g. `tab', `newline') are represented by one-byte as
 333    is.  It takes the range 0x00 through 0x7F.  The other characters
 334    are represented by a sequence of `base leading-code', optional
 335    `extended leading-code', and one or two `position-code's.  Length
 336    of the sequence is decided by the base leading-code.  Leading-code
 337    takes the range 0x80 through 0x9F, whereas extended leading-code
 338    and position-code take the range 0xA0 through 0xFF.  See the
 339    document of `charset.h' for more detail about leading-code and
 340    position-code.
 341
 342    There's one exception in this rule.  Special leading-code
 343    `leading-code-composition' denotes that the following several
 344    characters should be composed into one character.  Leading-codes of
 345    components (except for ASCII) are added 0x20.  An ASCII character
 346    component is represented by a 2-byte sequence of `0xA0' and
 347    `ASCII-code + 0x80'.  See also the document in `charset.h' for the
 348    detail of composite character.  Hence, we can summarize the code
 349    range as follows:
 350
 351    --- CODE RANGE of Emacs' internal format ---
 352    (character set)      (range)
 353    ASCII                0x00 .. 0x7F
 354    ELSE (1st byte)      0x80 .. 0x9F
 355         (rest bytes)    0xA0 .. 0xFF
 356    ---------------------------------------------
 357
 358   */
 359
 360 enum emacs_code_class_type emacs_code_class[256]; /* Indexed by byte value.  */
 361 /* Helper for detect_coding_emacs_mule; expects `src' and `src_end' in scope.  */
 362 /* Consume *SRC and continue if it is 0xA0 or greater; return 0 if it is
 363    smaller.  If SRC has reached SRC_END, go to `label_end_of_switch'.  */
 364 #define CHECK_CODE_RANGE_A0_FF  \
 365   do {                          \
 366     if (src >= src_end)         \
 367       goto label_end_of_switch; \
 368     else if (*src++ < 0xA0)     \
 369       return 0;                 \
 370   } while (0)
 371
 372 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 373    Check if a text is encoded in Emacs' internal format.  If it is,
 374    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 375 /* Text cut off in the middle of a multi-byte sequence is still accepted.  */
 376 int
 377 detect_coding_emacs_mule (src, src_end)
 378      unsigned char *src, *src_end;
 379 {
 380   unsigned char c;
 381   int composing = 0;            /* Nonzero inside a composite sequence.  */
 382
 383   while (src < src_end)
 384     {
 385       c = *src++;
 386
 387       if (composing)
 388         {
 389           if (c < 0xA0)         /* Not a component: composition ended.  */
 390             composing = 0;
 391           else                  /* Undo the 0x20 added to component codes.  */
 392             c -= 0x20;
 393         }
 394
 395       switch (emacs_code_class[c])
 396         {
 397         case EMACS_ascii_code:
 398         case EMACS_linefeed_code:
 399           break;
 400
 401         case EMACS_control_code:
 402           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 403             return 0;           /* ESC, SI, or SO rules out this format.  */
 404           break;
 405
 406         case EMACS_invalid_code:
 407           return 0;
 408
 409         case EMACS_leading_code_composition: /* c == 0x80 */
 410           if (composing)        /* Byte was 0xA0: ASCII-component marker.  */
 411             CHECK_CODE_RANGE_A0_FF;
 412           else                  /* Start of a composite sequence.  */
 413             composing = 1;
 414           break;
 415
 416         case EMACS_leading_code_4:
 417           CHECK_CODE_RANGE_A0_FF;
 418           /* fall down to check it two more times ...  */
 419
 420         case EMACS_leading_code_3:
 421           CHECK_CODE_RANGE_A0_FF;
 422           /* fall down to check it one more time ...  */
 423
 424         case EMACS_leading_code_2:
 425           CHECK_CODE_RANGE_A0_FF;
 426           break;
 427
 428         default:
 429         label_end_of_switch:    /* Reached from CHECK_CODE_RANGE_A0_FF at end of text.  */
 430           break;
 431         }
 432     }
 433   return CODING_CATEGORY_MASK_EMACS_MULE;
 434 }
 435
 436 \f
 437 /*** 3. ISO2022 handlers ***/
 438
 439 /* The following note describes the coding system ISO2022 briefly.
 440    Since the intention of this note is to help understanding of the
 441    programs in this file, some parts are NOT ACCURATE or OVERLY
 442    SIMPLIFIED.  For a thorough understanding, please refer to the
 443    original document of ISO2022.
 444
 445    ISO2022 provides many mechanisms to encode several character sets
 446    in 7-bit and 8-bit environments.  If one chooses a 7-bit environment,
 447    all text is encoded by codes of less than 128.  This may make the
 448    encoded text a little bit longer, but the text becomes more robust
 449    when passing through several gateways (some of them strip the MSB).
 450
 451    There are two kinds of character sets: control character sets and
 452    graphic character sets.  The former contain control characters such
 453    as `newline' and `escape' to provide control functions (control
 454    functions are also provided by escape sequences).  The latter
 455    contain graphic characters such as 'A' and '-'.  Emacs recognizes
 456    two control character sets and many graphic character sets.
 457
 458    Graphic character sets are classified into one of the following
 459    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 460    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 461    bytes (DIMENSION) and the number of characters in one dimension
 462    (CHARS) of the set.  In addition, each character set is assigned an
 463    identification tag (called "final character" and denoted as <F>
 464    here after) which is unique in each class.  <F> of each character
 465    set is decided by ECMA(*) when it is registered in ISO.  Code range
 466    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 467
 468    Note (*): ECMA = European Computer Manufacturers Association
 469
 470    Here are examples of graphic character set [NAME(<F>)]:
 471         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 472         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 473         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 474         o DIMENSION2_CHARS96 -- none for the moment
 475
 476    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 477         C0 [0x00..0x1F] -- control character plane 0
 478         GL [0x20..0x7F] -- graphic character plane 0
 479         C1 [0x80..0x9F] -- control character plane 1
 480         GR [0xA0..0xFF] -- graphic character plane 1
 481
 482    A control character set is directly designated and invoked to C0 or
 483    C1 by an escape sequence.  The most common case is that ISO646's
 484    control character set is designated/invoked to C0 and ISO6429's
 485    control character set is designated/invoked to C1, and usually
 486    these designations/invocations are omitted in a coded text.  With
 487    7-bit environment, only C0 can be used, and a control character for
 488    C1 is encoded by an appropriate escape sequence to fit in the
 489    environment.  Every control character for C1 has a corresponding
 490    escape sequence defined.
 491
 492    A graphic character set is at first designated to one of four
 493    graphic registers (G0 through G3), then these graphic registers are
 494    invoked to GL or GR.  These designations and invocations can be
 495    done independently.  The most common case is that G0 is invoked to
 496    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 497    these invocations and designations are omitted in a coded text.
 498    With 7-bit environment, only GL can be used.
 499
 500    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 501    and 0x7F of GL area work as control characters SPACE and DEL
 502    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 503
 504    There are two ways of invocation: locking-shift and single-shift.
 505    With locking-shift, the invocation lasts until the next different
 506    invocation, whereas with single-shift, the invocation works only
 507    for the following character and doesn't affect locking-shift.
 508    Invocations are done by the following control characters or escape
 509    sequences.
 510
 511    ----------------------------------------------------------------------
 512    function             control char    escape sequence description
 513    ----------------------------------------------------------------------
 514    SI  (shift-in)               0x0F    none            invoke G0 to GL
 515    SO  (shift-out)              0x0E    none            invoke G1 to GL
 516    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 517    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 518    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 519    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 520    ----------------------------------------------------------------------
 521    The first four are for locking-shift.  Control characters for these
 522    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 523
 524    Designations are done by the following escape sequences.
 525    ----------------------------------------------------------------------
 526    escape sequence      description
 527    ----------------------------------------------------------------------
 528    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 529    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 530    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 531    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 532    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 533    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 534    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 535    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 536    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 537    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 538    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 539    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 540    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 541    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 542    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 543    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 544    ----------------------------------------------------------------------
 545
 546    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 547    of dimension 1, chars 94, and final character <F>, and etc.
 548
 549    Note (*): Although these designations are not allowed in ISO2022,
 550    Emacs accepts them on decoding, and produces them on encoding
 551    CHARS96 character set in a coding system which is characterized as
 552    7-bit environment, non-locking-shift, and non-single-shift.
 553
 554    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 555    '(' can be omitted.  We call this as "short-form" here after.
 556
 557    Now you may notice that there are a lot of ways for encoding the
 558    same multilingual text in ISO2022.  Actually, there exist many
 559    coding systems such as Compound Text (used in X's inter-client
 560    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 561    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 562    localized platforms), and all of these are variants of ISO2022.
 563
 564    In addition to the above, Emacs handles two more kinds of escape
 565    sequences: ISO6429's direction specification and Emacs' private
 566    sequence for specifying character composition.
 567
 568    ISO6429's direction specification takes the following format:
 569         o CSI ']'      -- end of the current direction
 570         o CSI '0' ']'  -- end of the current direction
 571         o CSI '1' ']'  -- start of left-to-right text
 572         o CSI '2' ']'  -- start of right-to-left text
 573    The control character CSI (0x9B: control sequence introducer) is
 574    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 575
 576    Character composition specification takes the following format:
 577         o ESC '0' -- start character composition
 578         o ESC '1' -- end character composition
 579    Since these are not standard escape sequences of any ISO, the use
 580    of them for these meaning is restricted to Emacs only.  */
 581
 582 enum iso_code_class_type iso_code_class[256]; /* Indexed by byte value.  */
 583
 584 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 585    Check if a text is encoded in ISO2022.  If it is, return an
 586    integer in which any of the following flag bits are set:
 587         CODING_CATEGORY_MASK_ISO_7
 588         CODING_CATEGORY_MASK_ISO_8_1
 589         CODING_CATEGORY_MASK_ISO_8_2
 590         CODING_CATEGORY_MASK_ISO_ELSE
 591    If a code which should never appear in ISO2022 is found,
 592    return 0.  */
 593 /* MASK starts with all four bits set; bits are cleared as evidence appears.  */
 594 int
 595 detect_coding_iso2022 (src, src_end)
 596      unsigned char *src, *src_end;
 597 {
 598   int mask = (CODING_CATEGORY_MASK_ISO_7
 599               | CODING_CATEGORY_MASK_ISO_8_1
 600               | CODING_CATEGORY_MASK_ISO_8_2
 601               | CODING_CATEGORY_MASK_ISO_ELSE);
 602   int g1 = 0;                   /* 1 iff designating to G1.  */
 603   int c, i;                     /* NOTE(review): `i' is never used here.  */
 604
 605   while (src < src_end)
 606     {
 607       c = *src++;
 608       switch (c)
 609         {
 610         case ISO_CODE_ESC:
 611           if (src >= src_end)   /* ESC is the last byte: nothing to inspect.  */
 612             break;
 613           c = *src++;           /* Byte following ESC.  */
 614           if (src < src_end
 615               && ((c >= '(' && c <= '/')
 616                   || c == '$' && ((*src >= '(' && *src <= '/')
 617                                   || (*src >= '@' && *src <= 'B'))))
 618             {
 619               /* Valid designation sequence.  */
 620               if (c == ')' || (c == '$' && *src == ')'))
 621                 {
 622                   g1 = 1;
 623                   mask &= ~CODING_CATEGORY_MASK_ISO_7;
 624                 }
 625               src++;            /* Skip the byte that follows C.  */
 626               break;
 627             }
 628           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 629             return CODING_CATEGORY_MASK_ISO_ELSE; /* ESC N/O/n/o: SS2, SS3, LS2, LS3.  */
 630           break;
 631
 632         case ISO_CODE_SO:
 633           if (g1)               /* Shift-out after a designation to G1.  */
 634             return CODING_CATEGORY_MASK_ISO_ELSE;
 635           break;
 636
 637         case ISO_CODE_CSI:
 638         case ISO_CODE_SS2:
 639         case ISO_CODE_SS3:
 640           mask &= ~CODING_CATEGORY_MASK_ISO_7;
 641           break;
 642
 643         default:
 644           if (c < 0x80)
 645             break;
 646           else if (c < 0xA0)    /* 0x80..0x9F other than the cases above.  */
 647             return 0;
 648           else
 649             {
 650               int count = 1;
 651               /* Count the run of bytes >= 0xA0 that starts with C.  */
 652               mask &= ~CODING_CATEGORY_MASK_ISO_7;
 653               while (src < src_end && *src >= 0xA0)
 654                 count++, src++;
 655               if (count & 1 && src < src_end) /* Odd run not cut off by end of text.  */
 656                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 657             }
 658           break;
 659         }
 660     }
 661
 662   return mask;
 663 }
 664
 665 /* Decode a character whose charset is CHARSET and whose 1st position
 666    code is C1.  If CHARSET has dimension 2, the 2nd position code is
 667    fetched from SRC into C2.  If unify_char yields a character (>= 0),
 668    that one is emitted instead.  A negative CHARSET means ill-formed
 669    text, and all we can do is to read C1 as is.  */
 670 /* Needs `coding', `dst', `src', `src_end', `c2', `unification_table' in scope.  */
 671 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 672   do {                                                                  \
 673     int c_alt, charset_alt = (charset);                                 \
 674     if (COMPOSING_HEAD_P (coding->composing))                           \
 675       {                                                                 \
 676         *dst++ = LEADING_CODE_COMPOSITION;                              \
 677         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 678           /* To tell composition rules are embedded.  */                \
 679           *dst++ = 0xFF;                                                \
 680         coding->composing += 2;                                         \
 681       }                                                                 \
 682     if ((charset) >= 0)                                                 \
 683       {                                                                 \
 684         if (CHARSET_DIMENSION (charset) == 2)                           \
 685           ONE_MORE_BYTE (c2);                                           \
 686         if (!NILP (unification_table)                                   \
 687             && ((c_alt = unify_char (unification_table,                 \
 688                                      -1, (charset), c1, c2)) >= 0))     \
 689           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 690       }                                                                 \
 691     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 692       DECODE_CHARACTER_ASCII (c1);                                      \
 693     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 694       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 695     else                                                                \
 696       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 697     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 698       /* To tell a composition rule follows.  */                        \
 699       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 700   } while (0)
 701 /* Designate the charset for (DIMENSION, CHARS, FINAL_CHAR) to register REG.  */
 702 /* A negative lookup result is ignored; direction 1 prefers the reverse charset.  */
 703 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 704   do {                                                                  \
 705     int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);     \
 706     if (charset >= 0)                                                   \
 707       {                                                                 \
 708         if (coding->direction == 1                                      \
 709             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 710           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 711         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 712       }                                                                 \
 713   } while (0)
 714
 715 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode the SRC_BYTES bytes of ISO2022 text at SOURCE into
        DESTINATION (capacity DST_BYTES bytes), store the number of source
        bytes consumed in *CONSUMED, and return the number of bytes
        written to DESTINATION.  Invocation, designation, direction, and
        composition state is read from and updated in *CODING.  When the
        source ends inside an escape sequence or a multi-byte code, the
        incomplete bytes are saved in CODING->carryover and are not
        counted as consumed, unless CODING->last_block is set, in which
        case the remaining bytes are copied out verbatim.  */
 716
 717 int
 718 decode_coding_iso2022 (coding, source, destination,
 719                        src_bytes, dst_bytes, consumed)
 720      struct coding_system *coding;
 721      unsigned char *source, *destination;
 722      int src_bytes, dst_bytes;
 723      int *consumed;
 724 {
 725   unsigned char *src = source;
 726   unsigned char *src_end = source + src_bytes;
 727   unsigned char *dst = destination;
 728   unsigned char *dst_end = destination + dst_bytes;
 729   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 730      from DST_END to assure that overflow checking is necessary only
 731      at the head of loop.  */
 732   unsigned char *adjusted_dst_end = dst_end - 6;
 733   int charset;
 734   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 735   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 736   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 737   Lisp_Object unification_table
 738       = coding->character_unification_table_for_decode;
 739
       /* Fall back to the standard table when unification is enabled but
          CODING supplies no table of its own.  */
 740   if (!NILP (Venable_character_unification) && NILP (unification_table))
 741     unification_table = Vstandard_character_unification_table_for_decode;
 742
 743   while (src < src_end && dst < adjusted_dst_end)
 744     {
 745       /* SRC_BASE remembers the start position in source in each loop.
 746          The loop will be exited when there's not enough source text
 747          to analyze long escape sequence or 2-byte code (within macros
 748          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 749          to SRC_BASE before exiting.  */
 750       unsigned char *src_base = src;
 751       int c1 = *src++, c2;
 752
 753       switch (iso_code_class [c1])
 754         {
 755         case ISO_0x20_or_0x7F:
 756           if (!coding->composing
 757               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 758             {
 759               /* This is SPACE or DEL.  */
 760               *dst++ = c1;
 761               break;
 762             }
 763           /* This is a graphic character, we fall down ...  */
 764
 765         case ISO_graphic_plane_0:
 766           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 767             {
 768               /* This is a composition rule.  */
 769               *dst++ = c1 | 0x80;
 770               coding->composing = COMPOSING_WITH_RULE_TAIL;
 771             }
 772           else
 773             DECODE_ISO_CHARACTER (charset0, c1);
 774           break;
 775
 776         case ISO_0xA0_or_0xFF:
 777           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 778             {
 779               /* Invalid code.  */
 780               *dst++ = c1;
 781               break;
 782             }
 783           /* This is a graphic character, we fall down ... */
 784
 785         case ISO_graphic_plane_1:
 786           DECODE_ISO_CHARACTER (charset1, c1);
 787           break;
 788
 789         case ISO_control_code:
 790           /* All ISO2022 control characters in this class have the
 791              same representation in Emacs internal format.  */
 792           *dst++ = c1;
 793           break;
 794
 795         case ISO_carriage_return:
 796           if (coding->eol_type == CODING_EOL_CR)
 797             {
 798               *dst++ = '\n';
 799             }
 800           else if (coding->eol_type == CODING_EOL_CRLF)
 801             {
 802               ONE_MORE_BYTE (c1);
 803               if (c1 == ISO_CODE_LF)
 804                 *dst++ = '\n';
 805               else
 806                 {
                       /* A lone CR.  C1 now holds the byte that followed
                          it, so push that byte back and emit the CR
                          itself rather than C1; the pushed-back byte is
                          decoded by the next iteration.  */
 807                   src--;
 808                   *dst++ = '\r';
 809                 }
 810             }
 811           else
 812             {
 813               *dst++ = c1;
 814             }
 815           break;
 816
 817         case ISO_shift_out:
 818           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 819             goto label_invalid_escape_sequence;
 820           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 821           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 822           break;
 823
 824         case ISO_shift_in:
 825           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 826           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 827           break;
 828
 829         case ISO_single_shift_2_7:
 830         case ISO_single_shift_2:
 831           /* SS2 is handled as an escape sequence of ESC 'N' */
 832           c1 = 'N';
 833           goto label_escape_sequence;
 834
 835         case ISO_single_shift_3:
 836           /* SS3 is handled as an escape sequence of ESC 'O' */
 837           c1 = 'O';
 838           goto label_escape_sequence;
 839
 840         case ISO_control_sequence_introducer:
 841           /* CSI is handled as an escape sequence of ESC '[' ...  */
 842           c1 = '[';
 843           goto label_escape_sequence;
 844
 845         case ISO_escape:
 846           ONE_MORE_BYTE (c1);
 847         label_escape_sequence:
 848           /* Escape sequences handled by Emacs are invocation,
 849              designation, direction specification, and character
 850              composition specification.  */
 851           switch (c1)
 852             {
 853             case '&':           /* revision of following character set */
 854               ONE_MORE_BYTE (c1);
 855               if (!(c1 >= '@' && c1 <= '~'))
 856                 goto label_invalid_escape_sequence;
 857               ONE_MORE_BYTE (c1);
 858               if (c1 != ISO_CODE_ESC)
 859                 goto label_invalid_escape_sequence;
 860               ONE_MORE_BYTE (c1);
 861               goto label_escape_sequence;
 862
 863             case '$':           /* designation of 2-byte character set */
 864               ONE_MORE_BYTE (c1);
 865               if (c1 >= '@' && c1 <= 'B')
 866                 {       /* designation of JISX0208.1978, GB2312.1980,
 867                                    or JISX0208.1980 */
 868                   DECODE_DESIGNATION (0, 2, 94, c1);
 869                 }
 870               else if (c1 >= 0x28 && c1 <= 0x2B)
 871                 {       /* designation of DIMENSION2_CHARS94 character set */
 872                   ONE_MORE_BYTE (c2);
 873                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 874                 }
 875               else if (c1 >= 0x2C && c1 <= 0x2F)
 876                 {       /* designation of DIMENSION2_CHARS96 character set */
 877                   ONE_MORE_BYTE (c2);
 878                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 879                 }
 880               else
 881                 goto label_invalid_escape_sequence;
 882               break;
 883
 884             case 'n':           /* invocation of locking-shift-2 */
 885               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 886                 goto label_invalid_escape_sequence;
 887               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 888               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 889               break;
 890
 891             case 'o':           /* invocation of locking-shift-3 */
 892               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 893                 goto label_invalid_escape_sequence;
 894               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 895               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 896               break;
 897
 898             case 'N':           /* invocation of single-shift-2 */
 899               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 900                 goto label_invalid_escape_sequence;
 901               ONE_MORE_BYTE (c1);
 902               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 903               DECODE_ISO_CHARACTER (charset, c1);
 904               break;
 905
 906             case 'O':           /* invocation of single-shift-3 */
 907               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 908                 goto label_invalid_escape_sequence;
 909               ONE_MORE_BYTE (c1);
 910               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 911               DECODE_ISO_CHARACTER (charset, c1);
 912               break;
 913
 914             case '0':           /* start composing without embedded rules */
 915               coding->composing = COMPOSING_NO_RULE_HEAD;
 916               break;
 917
 918             case '1':           /* end composing */
 919               coding->composing = COMPOSING_NO;
 920               break;
 921
 922             case '2':           /* start composing with embedded rules */
 923               coding->composing = COMPOSING_WITH_RULE_HEAD;
 924               break;
 925
 926             case '[':           /* specification of direction */
 927               /* For the moment, nested direction is not supported.
 928                  So, the value of `coding->direction' is 0 or 1: 0
 929                  means left-to-right, 1 means right-to-left.  */
 930               ONE_MORE_BYTE (c1);
 931               switch (c1)
 932                 {
 933                 case ']':       /* end of the current direction */
 934                   coding->direction = 0;
                       /* `CSI ]' is complete as it stands; do not fall
                          into the parameterized forms below, which read
                          one more byte and require it to be ']'.  */
                       break;
 935
 936                 case '0':       /* end of the current direction */
 937                 case '1':       /* start of left-to-right direction */
 938                   ONE_MORE_BYTE (c1);
 939                   if (c1 == ']')
 940                     coding->direction = 0;
 941                   else
 942                     goto label_invalid_escape_sequence;
 943                   break;
 944
 945                 case '2':       /* start of right-to-left direction */
 946                   ONE_MORE_BYTE (c1);
 947                   if (c1 == ']')
 948                     coding->direction= 1;
 949                   else
 950                     goto label_invalid_escape_sequence;
 951                   break;
 952
 953                 default:
 954                   goto label_invalid_escape_sequence;
 955                 }
 956               break;
 957
 958             default:
 959               if (c1 >= 0x28 && c1 <= 0x2B)
 960                 {       /* designation of DIMENSION1_CHARS94 character set */
 961                   ONE_MORE_BYTE (c2);
 962                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 963                 }
 964               else if (c1 >= 0x2C && c1 <= 0x2F)
 965                 {       /* designation of DIMENSION1_CHARS96 character set */
 966                   ONE_MORE_BYTE (c2);
 967                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 968                 }
 969               else
 970                 {
 971                   goto label_invalid_escape_sequence;
 972                 }
 973             }
 974           /* We must update these variables now.  */
 975           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 976           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 977           break;
 978
 979         label_invalid_escape_sequence:
               /* Pass the bytes read so far in this iteration through
                  unchanged.  */
 980           {
 981             int length = src - src_base;
 982
 983             bcopy (src_base, dst, length);
 984             dst += length;
 985           }
 986         }
 987       continue;
 988
 989     label_end_of_loop:
           /* The source ran out in the middle of a sequence: keep the
              partial bytes in CODING->carryover and rewind SRC so that
              they are not reported as consumed.  */
 990       coding->carryover_size = src - src_base;
 991       bcopy (src_base, coding->carryover, coding->carryover_size);
 992       src = src_base;
 993       break;
 994     }
 995
 996   /* If this is the last block of the text to be decoded, we had
 997      better just flush out all remaining codes in the text although
 998      they are not valid characters.
          NOTE(review): this copy is not checked against DST_END; it
          relies on the caller supplying a large enough DESTINATION.  */
 999   if (coding->last_block)
1000     {
1001       bcopy (src, dst, src_end - src);
1002       dst += (src_end - src);
1003       src = src_end;
1004     }
1005   *consumed = src - source;
1006   return dst - destination;
1007 }
1008
1009 /* ISO2022 encoding stuff.  */
1010
1011 /*
1012    It is not enough to say just "ISO2022" on encoding, but we have to
1013    specify more details.  In Emacs, each coding-system of ISO2022
1014    variant has the following specifications:
1015         1. Initial designation to G0 thru G3.
1016         2. Allows short-form designation?
1017         3. ASCII should be designated to G0 before control characters?
1018         4. ASCII should be designated to G0 at end of line?
1019         5. 7-bit environment or 8-bit environment?
1020         6. Use locking-shift?
1021         7. Use Single-shift?
1022    And the following two are only for Japanese:
1023         8. Use ASCII in place of JIS0201-1976-Roman?
1024         9. Use JISX0208-1983 in place of JISX0208-1978?
1025    These specifications are encoded in `coding->flags' as flag bits
1026    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1027    detail.
1028 */
1029
1030 /* Emit at DST the escape sequence designating CHARSET to graphic
1031    register REG, then record that designation in CODING.  If
1032    Vcharset_revision_alist has an entry for CHARSET, an `ESC & <rev>'
1033    prefix comes first.  The intermediate character is left out
        (short-form) only for a 2-byte 94-character CHARSET whose
        <final-char> is '@', 'A', or 'B', designated to register 0 under
        a CODING whose flags include CODING_FLAG_ISO_SHORT_FORM.  Expects
        a variable `dst' in the expansion scope.  */
1034
1035 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1036   do {                                                                  \
1037     unsigned char final_ = CHARSET_ISO_FINAL_CHAR (charset);            \
1038     Lisp_Object revision_                                               \
1039       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1040     /* Intermediate characters for registers 0..3, chosen by the size   \
1041        of CHARSET's graphic set.  */                                    \
1042     char *intermediates_                                                \
1043       = (CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./");              \
1044     /* Non-zero when the short-form sequence is to be produced.  */     \
1045     int short_form_                                                     \
1046       = (CHARSET_DIMENSION (charset) != 1                               \
1047          && CHARSET_CHARS (charset) == 94                               \
1048          && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)                \
1049          && reg == 0                                                    \
1050          && final_ >= '@' && final_ <= 'B');                            \
1051     if (! NILP (revision_))                                             \
1052       {                                                                 \
1053         *dst++ = ISO_CODE_ESC;                                          \
1054         *dst++ = '&';                                                   \
1055         *dst++ = XINT (XCONS (revision_)->cdr) + '@';                   \
1056       }                                                                 \
1057     *dst++ = ISO_CODE_ESC;                                              \
1058     if (CHARSET_DIMENSION (charset) != 1)                               \
1059       *dst++ = '$';                                                     \
1060     if (! short_form_)                                                  \
1061       *dst++ = (unsigned char) (intermediates_[reg]);                   \
1062     *dst++ = final_;                                                    \
1063     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1064   } while (0)
1072
1073 /* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 write the code for
1074    the ISO2022 single-shift-2 and single-shift-3 functions at DST:
1075    the two-byte escape sequence (ESC 'N' or ESC 'O') when CODING's
        flags include CODING_FLAG_ISO_SEVEN_BITS, otherwise the single
        control code ISO_CODE_SS2 or ISO_CODE_SS3.  Both then mark CODING
        as single-shifting.  */
1076
1077 #define ENCODE_SINGLE_SHIFT_2                           \
1078   do {                                                  \
1079     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1080       {                                                 \
1081         *dst++ = ISO_CODE_ESC;                          \
1082         *dst++ = 'N';                                   \
1083       }                                                 \
1084     else                                                \
1085       {                                                 \
1086         *dst++ = ISO_CODE_SS2;                          \
1087       }                                                 \
1088     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1089   } while (0)
1090
1091 #define ENCODE_SINGLE_SHIFT_3                           \
1092   do {                                                  \
1093     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
           {                                                 \
             *dst++ = ISO_CODE_ESC;                          \
             *dst++ = 'O';                                   \
           }                                                 \
         else                                                \
           {                                                 \
             *dst++ = ISO_CODE_SS3;                          \
           }                                                 \
         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
       } while (0)
1094
1095 /* The four macros below write the code for an ISO2022 locking-shift
1096    function at DST and record in CODING which graphic register is
1097    now invoked to graphic plane 0: shift-in (register 0), shift-out
        (register 1), locking-shift-2 (register 2), and locking-shift-3
        (register 3).  */
1098
1099 #define ENCODE_SHIFT_IN                         \
1100   do {                                          \
1101     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1102     *dst++ = ISO_CODE_SI;                       \
1103   } while (0)
1104
1105 #define ENCODE_SHIFT_OUT                        \
1106   do {                                          \
1107     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1108     *dst++ = ISO_CODE_SO;                       \
1109   } while (0)
1110
1111 #define ENCODE_LOCKING_SHIFT_2                  \
1112   do {                                          \
1113     *dst++ = ISO_CODE_ESC;                      \
1114     *dst++ = 'n';                               \
1115     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1116   } while (0)
1117
1118 #define ENCODE_LOCKING_SHIFT_3                  \
1119   do {                                          \
1120     *dst++ = ISO_CODE_ESC;                      \
1121     *dst++ = 'o';                               \
         CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
       } while (0)
1122
1123 /* Write at DST the one-byte code of a DIMENSION1 character whose
1124    character set is CHARSET and whose position-code is C1.  If
1125    CHARSET is not invoked to either graphic plane, the necessary
        designation and invocation sequences are produced first by
        encode_invocation_designation, and the loop is retried.  */
1126
1127
1128 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1129   do {                                                                  \
1130     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1131       {                                                                 \
1132         /* A single-shift code was just produced; this character        \
1133            uses it up.  */                                              \
1134         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1135         *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
1136                   ? (c1 & 0x7F) : (c1 | 0x80));                         \
1137         break;                                                          \
1138       }                                                                 \
1139     if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
1140       {                                                                 \
1141         *dst++ = c1 & 0x7F;                                             \
1142         break;                                                          \
1143       }                                                                 \
1144     if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
1145       {                                                                 \
1146         *dst++ = c1 | 0x80;                                             \
1147         break;                                                          \
1148       }                                                                 \
1149     /* CHARSET is not yet invoked to any graphic plane: invoke it,      \
1150        designating it to some graphic register first if needed, then    \
1151        go round the loop again to produce the character itself.  */     \
1152     dst = encode_invocation_designation (charset, coding, dst);         \
1153   } while (1)
1156
1157 /* Write at DST the two-byte code of a DIMENSION2 character whose
1158    character set is CHARSET and whose position-codes are C1 and C2.
1159    If CHARSET is not invoked to either graphic plane, the necessary
        designation and invocation sequences are produced first by
        encode_invocation_designation, and the loop is retried.  */
1160
1161 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1162   do {                                                                  \
1163     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1164       {                                                                 \
1165         /* A single-shift code was just produced; this character        \
1166            uses it up.  */                                              \
1167         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1168         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1169           {                                                             \
1170             *dst++ = c1 & 0x7F;                                         \
1171             *dst++ = c2 & 0x7F;                                         \
1172           }                                                             \
1173         else                                                            \
1174           {                                                             \
1175             *dst++ = c1 | 0x80;                                         \
1176             *dst++ = c2 | 0x80;                                         \
1177           }                                                             \
1178         break;                                                          \
1179       }                                                                 \
1180     if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
1181       {                                                                 \
1182         *dst++ = c1 & 0x7F;                                             \
1183         *dst++ = c2 & 0x7F;                                             \
1184         break;                                                          \
1185       }                                                                 \
1186     if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
1187       {                                                                 \
1188         *dst++ = c1 | 0x80;                                             \
             *dst++ = c2 | 0x80;                                             \
             break;                                                          \
           }                                                                 \
         /* CHARSET is not yet invoked to any graphic plane: invoke it,      \
            designating it to some graphic register first if needed, then    \
            go round the loop again to produce the character itself.  */     \
         dst = encode_invocation_designation (charset, coding, dst);         \
       } while (1)
1189
1190 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1191   do {                                                                    \
1192     int c_alt, charset_alt;                                               \
1193     if (!NILP (unification_table)                                         \
1194         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1195             >= 0))                                                        \
1196       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1197     else                                                                  \
1198       charset_alt = charset;                                              \
1199     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1200       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1201     else                                                                  \
1202       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1203   } while (0)
1204
1205 /* Write at DST whatever designation and invocation codes are needed
1206    before a character of CHARSET can be encoded, updating the ISO2022
1207    state held in *CODING to match.  Return the new DST.  */
1208
1209 unsigned char *
1210 encode_invocation_designation (charset, coding, dst)
1211      int charset;
1212      struct coding_system *coding;
1213      unsigned char *dst;
1214 {
1215   int reg = 0;                  /* graphic register number */
1216
1217   /* Look for a graphic register CHARSET is already designated to.  */
1218   while (reg < 4 && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
1219     reg++;
1220
1221   if (reg == 4)
1222     {
1223       /* None: designate CHARSET to the register it requests, or to
1224          graphic register 0 when it requests no particular one.  */
1225       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1226       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1227         reg = 0;
1228
1229       ENCODE_DESIGNATION (charset, reg, coding);
1230     }
1231
1232   /* Nothing more to do if REG is already invoked to a graphic plane.  */
1233   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
1234       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
1235     return dst;
1236
1237   /* Otherwise invoke REG to graphic plane 0.  Registers 2 and 3 use a
1238      single shift when CODING's flags ask for it, a locking shift
1239      otherwise.  */
1240   if (reg == 0)
1241     ENCODE_SHIFT_IN;
1242   else if (reg == 1)
1243     ENCODE_SHIFT_OUT;
1244   else if (reg == 2)
1245     {
1246       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1247         ENCODE_SINGLE_SHIFT_2;
1248       else
1249         ENCODE_LOCKING_SHIFT_2;
1250     }
1251   else if (reg == 3)
1252     {
1253       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1254         ENCODE_SINGLE_SHIFT_3;
1255       else
1256         ENCODE_LOCKING_SHIFT_3;
1257     }
1258
1259   return dst;
1260 }
1267
1268 /* The following three macros produce codes for indicating composition:
        ESC '0' starts composing without embedded rules, ESC '2' starts
        composing with embedded rules, and ESC '1' ends composing.  Each
        expands to a parenthesized expression that writes the two bytes
        through `dst', so it groups as one operand wherever it is used.  */
1269 #define ENCODE_COMPOSITION_NO_RULE_START        \
       (*dst++ = ISO_CODE_ESC, *dst++ = '0')
1270 #define ENCODE_COMPOSITION_WITH_RULE_START      \
       (*dst++ = ISO_CODE_ESC, *dst++ = '2')
1271 #define ENCODE_COMPOSITION_END                  \
       (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1272
1273 /* The following three macros produce codes for indicating direction
1274    of text.  ENCODE_CONTROL_SEQUENCE_INTRODUCER writes ESC '[' when
        CODING's flags include CODING_FLAG_ISO_SEVEN_BITS and the single
        code ISO_CODE_CSI otherwise.  ENCODE_DIRECTION_R2L and
        ENCODE_DIRECTION_L2R follow the introducer with "2]" and "0]"
        respectively.  All three are statements (do ... while (0)) and
        expect `coding' and `dst' in the expansion scope.  */
1275 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1276   do {                                                  \
         /* Test the seven-bit flag as a bit, like the other  \
            ENCODE_ macros do, not the whole flag word.  */   \
1277     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1278       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1279     else                                                \
1280       *dst++ = ISO_CODE_CSI;                            \
1281   } while (0)
1282
1283 #define ENCODE_DIRECTION_R2L                    \
1284   do {                                          \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
         *dst++ = '2';                               \
         *dst++ = ']';                               \
       } while (0)
1285
1286 #define ENCODE_DIRECTION_L2R                    \
1287   do {                                          \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
         *dst++ = '0';                               \
         *dst++ = ']';                               \
       } while (0)
1288
1289 /* Write at DST the codes that bring the graphic planes and registers
1290    back to their initial state: a shift-in if graphic plane 0 does
        not have register 0 invoked, then a designation sequence for every
        register whose initial designation is non-negative and differs
        from its current designation.  */
1291 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1292   do {                                                                      \
1293     int reset_reg_ = 0;                                                     \
1294     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1295       ENCODE_SHIFT_IN;                                                      \
1296     while (reset_reg_ < 4)                                                  \
1297       {                                                                     \
1298         if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_) >= 0)  \
1299           {                                                                 \
1300             if (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg_)            \
                     != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding,             \
                                                             reset_reg_))        \
                   ENCODE_DESIGNATION                                            \
                     (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_),  \
                      reset_reg_, coding);                                       \
               }                                                                 \
             reset_reg_++;                                                       \
           }                                                                     \
       } while (0)
1303
1304 /* Scan the text from SRC up to the first newline or SRC_END and, for
1305    each graphic register, note the first charset found that requests
        designation to that register.  Then write at *DSTP designation
        sequences for the noted charsets that are not already designated
        to their register in CODING, and store the new write position
        back in *DSTP.  TABLE, if non-nil, is a unification table applied
        to each character before its charset is examined.
1306
1307    If the current block ends before any end-of-line, we may fail to
1308    find all the necessary designations.  */
1309 encode_designation_at_bol (coding, table, src, src_end, dstp)
1310      struct coding_system *coding;
1311      Lisp_Object table;
1312      unsigned char *src, *src_end, **dstp;
1313 {
1314   int charset, found = 0, reg;
1315   /* Table of charsets to be designated to each graphic register.  */
1316   int r[4];
       /* ENCODE_DESIGNATION writes through a variable named `dst'.  */
1317   unsigned char *dst = *dstp;
1318
1319   for (reg = 0; reg < 4; reg++)
1320     r[reg] = -1;
1321
       /* Stop early once every register has a charset noted.  */
1322   while (src < src_end && *src != '\n' && found < 4)
1323     {
1324       int bytes = BYTES_BY_CHAR_HEAD (*src);
1325
1326       if (NILP (table))
1327         charset = CHARSET_AT (src);
1328       else
1329         {
1330           int c_alt, c1, c2;
1331
1332           SPLIT_STRING(src, bytes, charset, c1, c2);
1333           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1334             charset = CHAR_CHARSET (c_alt);
1335         }
1336
1337       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
           /* A charset that requests no particular register yields the
              sentinel CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION, which is
              not a valid index into R; skip it.  Otherwise note CHARSET
              only if its register has nothing noted yet.  */
1338       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
               && reg >= 0 && reg < 4
               && r[reg] < 0)
1339         {
1340           found++;
1341           r[reg] = charset;
1342         }
1343
1344       src += bytes;
1345     }
1346
1347   if (found)
1348     {
1349       for (reg = 0; reg < 4; reg++)
1350         if (r[reg] >= 0
1351             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1352           ENCODE_DESIGNATION (r[reg], reg, coding);
1353       *dstp = dst;
1354     }
1355 }
1356
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the SRC_BYTES bytes at SOURCE into ISO2022 as described by
   CODING and write the result into DESTINATION, whose size is
   DST_BYTES.  Set *CONSUMED to SRC - SOURCE as of exit and return the
   number of bytes written into DESTINATION.

   Bytes of a character that is cut off at the end of SOURCE are
   saved in CODING->carryover (see label_end_of_loop below).  */

int
encode_coding_iso2022 (coding, source, destination,
                       src_bytes, dst_bytes, consumed)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
     int *consumed;
{
  unsigned char *src = source;
  unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Since the maximum bytes produced by each loop is 20, we subtract 19
     from DST_END to assure overflow checking is necessary only at the
     head of loop.  */
  unsigned char *adjusted_dst_end = dst_end - 19;
  Lisp_Object unification_table
      = coding->character_unification_table_for_encode;

  /* When character unification is enabled but CODING carries no table
     of its own, fall back to the standard table for encoding.  */
  if (!NILP (Venable_character_unification) && NILP (unification_table))
    unification_table = Vstandard_character_unification_table_for_encode;

  while (src < src_end && dst < adjusted_dst_end)
    {
      /* SRC_BASE remembers the start position in source in each loop.
         The loop will be exited when there's not enough source text
         to analyze multi-byte codes (within macros ONE_MORE_BYTE,
         TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, the
         bytes from SRC_BASE up to SRC are saved in CODING->carryover.
         NOTE(review): SRC itself is not rewound to SRC_BASE here,
         although decode_coding_sjis_big5 and encode_coding_sjis_big5
         do rewind it -- confirm that this difference is intended.  */
      unsigned char *src_base = src;
      int charset, c1, c2, c3, c4;

      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
          && CODING_SPEC_ISO_BOL (coding))
        {
          /* We have to produce designation sequences if any now.  */
          encode_designation_at_bol (coding, unification_table,
                                     src, src_end, &dst);
          CODING_SPEC_ISO_BOL (coding) = 0;
        }

      c1 = *src++;
      /* If we are seeing a component of a composite character, we are
         seeing a leading-code specially encoded for composition, or a
         composition rule if composing with rule.  We must set C1
         to a normal leading-code or an ASCII code.  If we are not at
         a composed character, we must reset the composition state.  */
      if (COMPOSING_P (coding->composing))
        {
          if (c1 < 0xA0)
            {
              /* We are not in a composite character any longer.  */
              coding->composing = COMPOSING_NO;
              ENCODE_COMPOSITION_END;
            }
          else
            {
              if (coding->composing == COMPOSING_WITH_RULE_RULE)
                {
                  /* C1 is a composition rule: produce it with the
                     high bit cleared and expect a component next.  */
                  *dst++ = c1 & 0x7F;
                  coding->composing = COMPOSING_WITH_RULE_HEAD;
                  continue;
                }
              else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
                /* C1 starts a component; a rule is expected after it.  */
                coding->composing = COMPOSING_WITH_RULE_RULE;
              if (c1 == 0xA0)
                {
                  /* This is an ASCII component.  */
                  ONE_MORE_BYTE (c1);
                  c1 &= 0x7F;
                }
              else
                /* This is a leading-code of non ASCII component.  */
                c1 -= 0x20;
            }
        }

      /* Now encode one character.  C1 is a control character, an
         ASCII character, or a leading-code of multi-byte character.  */
      switch (emacs_code_class[c1])
        {
        case EMACS_ascii_code:
          ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
          break;

        case EMACS_control_code:
          if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER;
          *dst++ = c1;
          break;

        case EMACS_carriage_return_code:
          if (!coding->selective)
            {
              /* '\r' is an ordinary control code here.  */
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              *dst++ = c1;
              break;
            }
          /* fall down to treat '\r' as '\n' ...  */

        case EMACS_linefeed_code:
          if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
            ENCODE_RESET_PLANE_AND_REGISTER;
          /* Forget the current designations so that the next line
             starts from the initial ones.  */
          if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
            bcopy (coding->spec.iso2022.initial_designation,
                   coding->spec.iso2022.current_designation,
                   sizeof coding->spec.iso2022.initial_designation);
          /* Produce the end-of-line in the format of CODING.  */
          if (coding->eol_type == CODING_EOL_LF
              || coding->eol_type == CODING_EOL_UNDECIDED)
            *dst++ = ISO_CODE_LF;
          else if (coding->eol_type == CODING_EOL_CRLF)
            *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
          else
            *dst++ = ISO_CODE_CR;
          CODING_SPEC_ISO_BOL (coding) = 1;
          break;

        case EMACS_leading_code_2:
          ONE_MORE_BYTE (c2);
          ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
          break;

        case EMACS_leading_code_3:
          TWO_MORE_BYTES (c2, c3);
          /* For a leading-code below LEADING_CODE_PRIVATE_11, C1 is
             passed as the charset; otherwise C2 is.  */
          if (c1 < LEADING_CODE_PRIVATE_11)
            ENCODE_ISO_CHARACTER (c1, c2, c3);
          else
            ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
          break;

        case EMACS_leading_code_4:
          THREE_MORE_BYTES (c2, c3, c4);
          ENCODE_ISO_CHARACTER (c2, c3, c4);
          break;

        case EMACS_leading_code_composition:
          ONE_MORE_BYTE (c1);
          if (c1 == 0xFF)
            {
              coding->composing = COMPOSING_WITH_RULE_HEAD;
              ENCODE_COMPOSITION_WITH_RULE_START;
            }
          else
            {
              /* Rewind one byte because it is a character code of
                 composition elements.  */
              src--;
              coding->composing = COMPOSING_NO_RULE_HEAD;
              ENCODE_COMPOSITION_NO_RULE_START;
            }
          break;

        case EMACS_invalid_code:
          /* Pass an invalid byte through unchanged.  */
          *dst++ = c1;
          break;
        }
      continue;
    label_end_of_loop:
      /* Not reachable by falling through (see the `continue' just
         above); presumably the *_MORE_BYTE(S) macros jump here when
         the source ends inside a character.  Save the bytes scanned
         in this iteration and leave the loop.  */
      coding->carryover_size = src - src_base;
      bcopy (src_base, coding->carryover, coding->carryover_size);
      break;
    }

  /* If this is the last block of the text to be encoded, we must
     reset graphic planes and registers to the initial state.  */
  if (src >= src_end && coding->last_block)
    {
      ENCODE_RESET_PLANE_AND_REGISTER;
      /* Copy any saved carryover bytes to the output as they are, but
         only when they are strictly fewer than the bytes left in
         DESTINATION.  */
      if (coding->carryover_size > 0
          && coding->carryover_size < (dst_end - dst))
        {
          bcopy (coding->carryover, dst, coding->carryover_size);
          dst += coding->carryover_size;
          coding->carryover_size = 0;
        }
    }
  *consumed = src - source;
  return dst - destination;
}
1539
1540 \f
1541 /*** 4. SJIS and BIG5 handlers ***/
1542
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
1546
1547 /* SJIS is a coding system encoding three character sets: ASCII, right
1548    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1549    as is.  A character of charset katakana-jisx0201 is encoded by
1550    "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
1553
1554    --- CODE RANGE of SJIS ---
1555    (character set)      (range)
1556    ASCII                0x00 .. 0x7F
1557    KATAKANA-JISX0201    0xA0 .. 0xDF
1558    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1559             (2nd byte)  0x40 .. 0xFF
1560    -------------------------------
1561
1562 */
1563
1564 /* BIG5 is a coding system encoding two character sets: ASCII and
1565    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1566    character set and is encoded in two-byte.
1567
1568    --- CODE RANGE of BIG5 ---
1569    (character set)      (range)
1570    ASCII                0x00 .. 0x7F
1571    Big5 (1st byte)      0xA1 .. 0xFE
1572         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1573    --------------------------
1574
1575    Since the number of characters in Big5 is larger than maximum
1576    characters in Emacs' charset (96x96), it can't be handled as one
1577    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1578    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1579    contains frequently used characters and the latter contains less
1580    frequently used characters.  */
1581
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2.
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   Both number the Big5 characters consecutively (BIG5_SAME_ROW
   characters per 1st byte, starting at 0xA1); 1st bytes below 0xC9
   belong to `charset_big5_1', the others to `charset_big5_2'.

   Every argument is parenthesized in the expansions, so an
   expression such as `c & 0x7F' may be passed safely.  Arguments are
   still evaluated more than once and must have no side effects; the
   output arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd bytes 0x40..0x7E plus the 2nd bytes 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1614
/* Produce one decoded character of CHARSET whose position-codes are
   C1 and C2.  `unification_table' must be in scope at the point of
   use.  When that table is non-nil and `unify_char' returns a
   non-negative character for (CHARSET, C1, C2), the character is
   replaced by that one: SPLIT_CHAR stores its charset in a local and
   its position-codes back into C1 and C2.  The result is then handed
   to DECODE_CHARACTER_ASCII, DECODE_CHARACTER_DIMENSION1 or
   DECODE_CHARACTER_DIMENSION2 according to the final charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        int c_alt = unify_char (unification_table,                      \
                                -1, (charset), c1, c2);                 \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
  } while (0)
1629
/* Encode one character of CHARSET with position-codes C1 and C2 and
   store the result at DST, advancing DST by one to three bytes.
   `dst', `sjis_p' and `unification_table' must be in scope at the
   point of use.

   If `unification_table' is non-nil and `unify_char' returns a
   non-negative character, that character is encoded instead
   (SPLIT_CHAR overwrites C1 and C2 with its position-codes).

   - ASCII: C1 is stored as is.
   - One-dimensional charset: C1 alone for katakana-jisx0201 when
     SJIS_P is non-zero, otherwise the charset value followed by C1.
   - Two-dimensional charset (C1 and C2 are first masked with 0x7F):
     the SJIS form for jisx0208 when SJIS_P is non-zero, the BIG5 form
     for big5-1 and big5-2 when SJIS_P is zero, otherwise the charset
     value followed by C1 and C2 (three bytes).

   CHARSET, C1 and C2 are evaluated more than once, and C1 and C2 must
   be lvalues.  The expansion carries no trailing semicolon, so the
   macro is used like an ordinary statement.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, (charset),        \
                                 c1, c2))                                 \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = (charset);                                            \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            /* ENCODE_BIG5 takes (CHARSET, C1, C2, B1, B2).  */           \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1671
1672 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1673    Check if a text is encoded in SJIS.  If it is, return
1674    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1675
1676 int
1677 detect_coding_sjis (src, src_end)
1678      unsigned char *src, *src_end;
1679 {
1680   unsigned char c;
1681
1682   while (src < src_end)
1683     {
1684       c = *src++;
1685       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1686         return 0;
1687       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1688         {
1689           if (src < src_end && *src++ < 0x40)
1690             return 0;
1691         }
1692     }
1693   return CODING_CATEGORY_MASK_SJIS;
1694 }
1695
1696 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1697    Check if a text is encoded in BIG5.  If it is, return
1698    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1699
1700 int
1701 detect_coding_big5 (src, src_end)
1702      unsigned char *src, *src_end;
1703 {
1704   unsigned char c;
1705
1706   while (src < src_end)
1707     {
1708       c = *src++;
1709       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1710         return 0;
1711       if (c >= 0xA1)
1712         {
1713           if (src >= src_end)
1714             break;
1715           c = *src++;
1716           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1717             return 0;
1718         }
1719     }
1720   return CODING_CATEGORY_MASK_BIG5;
1721 }
1722
1723 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1724    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1725
1726 int
1727 decode_coding_sjis_big5 (coding, source, destination,
1728                          src_bytes, dst_bytes, consumed, sjis_p)
1729      struct coding_system *coding;
1730      unsigned char *source, *destination;
1731      int src_bytes, dst_bytes;
1732      int *consumed;
1733      int sjis_p;
1734 {
1735   unsigned char *src = source;
1736   unsigned char *src_end = source + src_bytes;
1737   unsigned char *dst = destination;
1738   unsigned char *dst_end = destination + dst_bytes;
1739   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1740      from DST_END to assure overflow checking is necessary only at the
1741      head of loop.  */
1742   unsigned char *adjusted_dst_end = dst_end - 3;
1743   Lisp_Object unification_table
1744       = coding->character_unification_table_for_decode;
1745
1746   if (!NILP (Venable_character_unification) && NILP (unification_table))
1747     unification_table = Vstandard_character_unification_table_for_decode;
1748
1749   while (src < src_end && dst < adjusted_dst_end)
1750     {
1751       /* SRC_BASE remembers the start position in source in each loop.
1752          The loop will be exited when there's not enough source text
1753          to analyze two-byte character (within macro ONE_MORE_BYTE).
1754          In that case, SRC is reset to SRC_BASE before exiting.  */
1755       unsigned char *src_base = src;
1756       unsigned char c1 = *src++, c2, c3, c4;
1757
1758       if (c1 == '\r')
1759         {
1760           if (coding->eol_type == CODING_EOL_CRLF)
1761             {
1762               ONE_MORE_BYTE (c2);
1763               if (c2 == '\n')
1764                 *dst++ = c2;
1765               else
1766                 /* To process C2 again, SRC is subtracted by 1.  */
1767                 *dst++ = c1, src--;
1768             }
1769           else
1770             *dst++ = c1;
1771         }
1772       else if (c1 < 0x20)
1773         *dst++ = c1;
1774       else if (c1 < 0x80)
1775         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1776       else if (c1 < 0xA0 || c1 >= 0xE0)
1777         {
1778           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1779           if (sjis_p)
1780             {
1781               ONE_MORE_BYTE (c2);
1782               DECODE_SJIS (c1, c2, c3, c4);
1783               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1784             }
1785           else if (c1 >= 0xE0 && c1 < 0xFF)
1786             {
1787               int charset;
1788
1789               ONE_MORE_BYTE (c2);
1790               DECODE_BIG5 (c1, c2, charset, c3, c4);
1791               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1792             }
1793           else                  /* Invalid code */
1794             *dst++ = c1;
1795         }
1796       else
1797         {
1798           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1799           if (sjis_p)
1800             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1801           else
1802             {
1803               int charset;
1804
1805               ONE_MORE_BYTE (c2);
1806               DECODE_BIG5 (c1, c2, charset, c3, c4);
1807               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1808             }
1809         }
1810       continue;
1811
1812     label_end_of_loop:
1813       coding->carryover_size = src - src_base;
1814       bcopy (src_base, coding->carryover, coding->carryover_size);
1815       src = src_base;
1816       break;
1817     }
1818
1819   *consumed = src - source;
1820   return dst - destination;
1821 }
1822
1823 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1824    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1825    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1826    sure that all these charsets are registered as official charset
1827    (i.e. do not have extended leading-codes).  Characters of other
1828    charsets are produced without any encoding.  If SJIS_P is 1, encode
1829    SJIS text, else encode BIG5 text.  */
1830
1831 int
1832 encode_coding_sjis_big5 (coding, source, destination,
1833                          src_bytes, dst_bytes, consumed, sjis_p)
1834      struct coding_system *coding;
1835      unsigned char *source, *destination;
1836      int src_bytes, dst_bytes;
1837      int *consumed;
1838      int sjis_p;
1839 {
1840   unsigned char *src = source;
1841   unsigned char *src_end = source + src_bytes;
1842   unsigned char *dst = destination;
1843   unsigned char *dst_end = destination + dst_bytes;
1844   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1845      from DST_END to assure overflow checking is necessary only at the
1846      head of loop.  */
1847   unsigned char *adjusted_dst_end = dst_end - 1;
1848   Lisp_Object unification_table
1849       = coding->character_unification_table_for_encode;
1850
1851   if (!NILP (Venable_character_unification) && NILP (unification_table))
1852     unification_table = Vstandard_character_unification_table_for_encode;
1853
1854   while (src < src_end && dst < adjusted_dst_end)
1855     {
1856       /* SRC_BASE remembers the start position in source in each loop.
1857          The loop will be exited when there's not enough source text
1858          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1859          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1860          before exiting.  */
1861       unsigned char *src_base = src;
1862       unsigned char c1 = *src++, c2, c3, c4;
1863
1864       if (coding->composing)
1865         {
1866           if (c1 == 0xA0)
1867             {
1868               ONE_MORE_BYTE (c1);
1869               c1 &= 0x7F;
1870             }
1871           else if (c1 >= 0xA0)
1872             c1 -= 0x20;
1873           else
1874             coding->composing = 0;
1875         }
1876
1877       switch (emacs_code_class[c1])
1878         {
1879         case EMACS_ascii_code:
1880           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1881           break;
1882
1883         case EMACS_control_code:
1884           *dst++ = c1;
1885           break;
1886
1887         case EMACS_carriage_return_code:
1888           if (!coding->selective)
1889             {
1890               *dst++ = c1;
1891               break;
1892             }
1893           /* fall down to treat '\r' as '\n' ...  */
1894
1895         case EMACS_linefeed_code:
1896           if (coding->eol_type == CODING_EOL_LF
1897               || coding->eol_type == CODING_EOL_UNDECIDED)
1898             *dst++ = '\n';
1899           else if (coding->eol_type == CODING_EOL_CRLF)
1900             *dst++ = '\r', *dst++ = '\n';
1901           else
1902             *dst++ = '\r';
1903           break;
1904
1905         case EMACS_leading_code_2:
1906           ONE_MORE_BYTE (c2);
1907           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1908           break;
1909
1910         case EMACS_leading_code_3:
1911           TWO_MORE_BYTES (c2, c3);
1912           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1913           break;
1914
1915         case EMACS_leading_code_4:
1916           THREE_MORE_BYTES (c2, c3, c4);
1917           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1918           break;
1919
1920         case EMACS_leading_code_composition:
1921           coding->composing = 1;
1922           break;
1923
1924         default:                /* i.e. case EMACS_invalid_code: */
1925           *dst++ = c1;
1926         }
1927       continue;
1928
1929     label_end_of_loop:
1930       coding->carryover_size = src - src_base;
1931       bcopy (src_base, coding->carryover, coding->carryover_size);
1932       src = src_base;
1933       break;
1934     }
1935
1936   *consumed = src - source;
1937   return dst - destination;
1938 }
1939
1940 \f
1941 /*** 5. End-of-line handlers ***/
1942
1943 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1944    This function is called only when `coding->eol_type' is
1945    CODING_EOL_CRLF or CODING_EOL_CR.  */
1946
1947 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1948      struct coding_system *coding;
1949      unsigned char *source, *destination;
1950      int src_bytes, dst_bytes;
1951      int *consumed;
1952 {
1953   unsigned char *src = source;
1954   unsigned char *src_end = source + src_bytes;
1955   unsigned char *dst = destination;
1956   unsigned char *dst_end = destination + dst_bytes;
1957   int produced;
1958
1959   switch (coding->eol_type)
1960     {
1961     case CODING_EOL_CRLF:
1962       {
1963         /* Since the maximum bytes produced by each loop is 2, we
1964            subtract 1 from DST_END to assure overflow checking is
1965            necessary only at the head of loop.  */
1966         unsigned char *adjusted_dst_end = dst_end - 1;
1967
1968         while (src < src_end && dst < adjusted_dst_end)
1969           {
1970             unsigned char *src_base = src;
1971             unsigned char c = *src++;
1972             if (c == '\r')
1973               {
1974                 ONE_MORE_BYTE (c);
1975                 if (c != '\n')
1976                   *dst++ = '\r';
1977                 *dst++ = c;
1978               }
1979             else
1980               *dst++ = c;
1981             continue;
1982
1983           label_end_of_loop:
1984             coding->carryover_size = src - src_base;
1985             bcopy (src_base, coding->carryover, coding->carryover_size);
1986             src = src_base;
1987             break;
1988           }
1989         *consumed = src - source;
1990         produced = dst - destination;
1991         break;
1992       }
1993
1994     case CODING_EOL_CR:
1995       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1996       bcopy (source, destination, produced);
1997       dst_end = destination + produced;
1998       while (dst < dst_end)
1999         if (*dst++ == '\r') dst[-1] = '\n';
2000       *consumed = produced;
2001       break;
2002
2003     default:                    /* i.e. case: CODING_EOL_LF */
2004       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2005       bcopy (source, destination, produced);
2006       *consumed = produced;
2007       break;
2008     }
2009
2010   return produced;
2011 }
2012
2013 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2014    format of end-of-line according to `coding->eol_type'.  If
2015    `coding->selective' is 1, code '\r' in source text also means
2016    end-of-line.  */
2017
2018 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2019      struct coding_system *coding;
2020      unsigned char *source, *destination;
2021      int src_bytes, dst_bytes;
2022      int *consumed;
2023 {
2024   unsigned char *src = source;
2025   unsigned char *dst = destination;
2026   int produced;
2027
2028   if (src_bytes <= 0)
2029     return 0;
2030
2031   switch (coding->eol_type)
2032     {
2033     case CODING_EOL_LF:
2034     case CODING_EOL_UNDECIDED:
2035       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2036       bcopy (source, destination, produced);
2037       if (coding->selective)
2038         {
2039           int i = produced;
2040           while (i--)
2041             if (*dst++ == '\r') dst[-1] = '\n';
2042         }
2043       *consumed = produced;
2044
2045     case CODING_EOL_CRLF:
2046       {
2047         unsigned char c;
2048         unsigned char *src_end = source + src_bytes;
2049         unsigned char *dst_end = destination + dst_bytes;
2050         /* Since the maximum bytes produced by each loop is 2, we
2051            subtract 1 from DST_END to assure overflow checking is
2052            necessary only at the head of loop.  */
2053         unsigned char *adjusted_dst_end = dst_end - 1;
2054
2055         while (src < src_end && dst < adjusted_dst_end)
2056           {
2057             c = *src++;
2058             if (c == '\n' || (c == '\r' && coding->selective))
2059               *dst++ = '\r', *dst++ = '\n';
2060             else
2061               *dst++ = c;
2062           }
2063         produced = dst - destination;
2064         *consumed = src - source;
2065         break;
2066       }
2067
2068     default:                    /* i.e. case CODING_EOL_CR: */
2069       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2070       bcopy (source, destination, produced);
2071       {
2072         int i = produced;
2073         while (i--)
2074           if (*dst++ == '\n') dst[-1] = '\r';
2075       }
2076       *consumed = produced;
2077     }
2078
2079   return produced;
2080 }
2081
2082 \f
2083 /*** 6. C library functions ***/
2084
2085 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2086    has a property `coding-system'.  The value of this property is a
   vector of length 5 (called a coding-vector).  Among the elements of
2088    this vector, the first (element[0]) and the fifth (element[4])
2089    carry important information for decoding/encoding.  Before
2090    decoding/encoding, this information should be set in fields of a
2091    structure of type `coding_system'.
2092
2093    A value of property `coding-system' can be a symbol of another
2094    subsidiary coding-system.  In that case, Emacs gets coding-vector
2095    from that symbol.
2096
2097    `element[0]' contains information to be set in `coding->type'.  The
2098    value and its meaning is as follows:
2099
2100    0 -- coding_type_emacs_mule
2101    1 -- coding_type_sjis
2102    2 -- coding_type_iso2022
2103    3 -- coding_type_big5
2104    4 -- coding_type_ccl encoder/decoder written in CCL
2105    nil -- coding_type_no_conversion
2106    t -- coding_type_undecided (automatic conversion on decoding,
2107                                no-conversion on encoding)
2108
2109    `element[4]' contains information to be set in `coding->flags' and
2110    `coding->spec'.  The meaning varies by `coding->type'.
2111
2112    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2113    of length 32 (of which the first 13 sub-elements are used now).
2114    Meanings of these sub-elements are:
2115
2116    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2117         If the value is an integer of valid charset, the charset is
2118         assumed to be designated to graphic register N initially.
2119
        If the value is negative, it is the negated value of a charset
        which reserves graphic register N, which means that the charset
        is not designated initially but should be designated to graphic
        register N just before encoding a character in that charset.
2124
2125         If the value is nil, graphic register N is never used on
2126         encoding.
2127
2128    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2129         Each value takes t or nil.  See the section ISO2022 of
2130         `coding.h' for more information.
2131
2132    If `coding->type' is `coding_type_big5', element[4] is t to denote
2133    BIG5-ETen or nil to denote BIG5-HKU.
2134
2135    If `coding->type' takes the other value, element[4] is ignored.
2136
2137    Emacs Lisp's coding system also carries information about format of
2138    end-of-line in a value of property `eol-type'.  If the value is
2139    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2140    means CODING_EOL_CR.  If it is not integer, it should be a vector
2141    of subsidiary coding systems of which property `eol-type' has one
2142    of above values.
2143
2144 */
2145
2146 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2147    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2148    is setup so that no conversion is necessary and return -1, else
2149    return 0.  */
2150
2151 int
2152 setup_coding_system (coding_system, coding)
2153      Lisp_Object coding_system;
2154      struct coding_system *coding;
2155 {
2156   Lisp_Object type, eol_type;
2157
2158   /* At first, set several fields default values.  */
2159   coding->require_flushing = 0;
2160   coding->last_block = 0;
2161   coding->selective = 0;
2162   coding->composing = 0;
2163   coding->direction = 0;
2164   coding->carryover_size = 0;
2165   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2166   coding->character_unification_table_for_decode = Qnil;
2167   coding->character_unification_table_for_encode = Qnil;
2168
2169   Vlast_coding_system_used = coding->symbol = coding_system;
2170   eol_type = Qnil;
2171   /* Get value of property `coding-system' until we get a vector.
2172      While doing that, also get values of properties
2173      `post-read-conversion', `pre-write-conversion',
2174      `character-unification-table-for-decode',
2175      `character-unification-table-for-encode' and `eol-type'.  */
2176   while (!NILP (coding_system) && SYMBOLP (coding_system))
2177     {
2178       if (NILP (coding->post_read_conversion))
2179         coding->post_read_conversion = Fget (coding_system,
2180                                              Qpost_read_conversion);
2181       if (NILP (coding->pre_write_conversion))
2182         coding->pre_write_conversion = Fget (coding_system,
2183                                              Qpre_write_conversion);
2184       if (NILP (eol_type))
2185         eol_type = Fget (coding_system, Qeol_type);
2186
2187       if (NILP (coding->character_unification_table_for_decode))
2188         coding->character_unification_table_for_decode
2189           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2190
2191       if (NILP (coding->character_unification_table_for_encode))
2192         coding->character_unification_table_for_encode
2193           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2194
2195       coding_system = Fget (coding_system, Qcoding_system);
2196     }
2197
2198   while (!NILP (coding->character_unification_table_for_decode)
2199          && SYMBOLP (coding->character_unification_table_for_decode))
2200         coding->character_unification_table_for_decode
2201           = Fget (coding->character_unification_table_for_decode,
2202                   Qcharacter_unification_table_for_decode);
2203   if (!NILP (coding->character_unification_table_for_decode)
2204       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2205       coding->character_unification_table_for_decode = Qnil;
2206
2207   while (!NILP (coding->character_unification_table_for_encode)
2208          && SYMBOLP (coding->character_unification_table_for_encode))
2209         coding->character_unification_table_for_encode
2210           = Fget (coding->character_unification_table_for_encode,
2211                   Qcharacter_unification_table_for_encode);
2212   if (!NILP (coding->character_unification_table_for_encode)
2213       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2214       coding->character_unification_table_for_encode = Qnil;
2215
2216   if (!VECTORP (coding_system)
2217       || XVECTOR (coding_system)->size != 5)
2218     goto label_invalid_coding_system;
2219
2220   if (VECTORP (eol_type))
2221     coding->eol_type = CODING_EOL_UNDECIDED;
2222   else if (XFASTINT (eol_type) == 1)
2223     coding->eol_type = CODING_EOL_CRLF;
2224   else if (XFASTINT (eol_type) == 2)
2225     coding->eol_type = CODING_EOL_CR;
2226   else
2227     coding->eol_type = CODING_EOL_LF;
2228
2229   type = XVECTOR (coding_system)->contents[0];
2230   switch (XFASTINT (type))
2231     {
2232     case 0:
2233       coding->type = coding_type_emacs_mule;
2234       break;
2235
2236     case 1:
2237       coding->type = coding_type_sjis;
2238       break;
2239
2240     case 2:
2241       coding->type = coding_type_iso2022;
2242       {
2243         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2244         Lisp_Object *flags;
2245         int i, charset, default_reg_bits = 0;
2246
2247         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2248           goto label_invalid_coding_system;
2249
2250         flags = XVECTOR (val)->contents;
2251         coding->flags
2252           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2253              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2254              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2255              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2256              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2257              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2258              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2259              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2260              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2261              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2262              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2263
2264         /* Invoke graphic register 0 to plane 0.  */
2265         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2266         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2267         CODING_SPEC_ISO_INVOCATION (coding, 1)
2268           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2269         /* Not single shifting at first.  */
2270         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2271         /* Beginning of buffer should also be regarded as bol. */
2272         CODING_SPEC_ISO_BOL(coding) = 1;
2273
2274         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2275            FLAGS[REG] can be one of below:
2276                 integer CHARSET: CHARSET occupies register I,
2277                 t: designate nothing to REG initially, but can be used
2278                   by any charsets,
2279                 list of integer, nil, or t: designate the first
2280                   element (if integer) to REG initially, the remaining
2281                   elements (if integer) is designated to REG on request,
2282                   if an element is t, REG can be used by any charset,
2283                 nil: REG is never used.  */
2284         for (charset = 0; charset <= MAX_CHARSET; charset++)
2285           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2286             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2287         for (i = 0; i < 4; i++)
2288           {
2289             if (INTEGERP (flags[i])
2290                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2291                 || (charset = get_charset_id (flags[i])) >= 0)
2292               {
2293                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2294                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2295               }
2296             else if (EQ (flags[i], Qt))
2297               {
2298                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2299                 default_reg_bits |= 1 << i;
2300               }
2301             else if (CONSP (flags[i]))
2302               {
2303                 Lisp_Object tail = flags[i];
2304
2305                 if (INTEGERP (XCONS (tail)->car)
2306                     && (charset = XINT (XCONS (tail)->car),
2307                         CHARSET_VALID_P (charset))
2308                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2309                   {
2310                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2311                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2312                   }
2313                 else
2314                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2315                 tail = XCONS (tail)->cdr;
2316                 while (CONSP (tail))
2317                   {
2318                     if (INTEGERP (XCONS (tail)->car)
2319                         && (charset = XINT (XCONS (tail)->car),
2320                             CHARSET_VALID_P (charset))
2321                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2322                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2323                         = i;
2324                     else if (EQ (XCONS (tail)->car, Qt))
2325                       default_reg_bits |= 1 << i;
2326                     tail = XCONS (tail)->cdr;
2327                   }
2328               }
2329             else
2330               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2331
2332             CODING_SPEC_ISO_DESIGNATION (coding, i)
2333               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2334           }
2335
2336         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2337           {
2338             /* REG 1 can be used only by locking shift in 7-bit env.  */
2339             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2340               default_reg_bits &= ~2;
2341             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2342               /* Without any shifting, only REG 0 and 1 can be used.  */
2343               default_reg_bits &= 3;
2344           }
2345
2346         for (charset = 0; charset <= MAX_CHARSET; charset++)
2347           if (CHARSET_VALID_P (charset)
2348               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2349                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2350             {
2351               /* We have not yet decided where to designate CHARSET.  */
2352               int reg_bits = default_reg_bits;
2353
2354               if (CHARSET_CHARS (charset) == 96)
2355                 /* A charset of CHARS96 can't be designated to REG 0.  */
2356                 reg_bits &= ~1;
2357
2358               if (reg_bits)
2359                 /* There exist some default graphic register.  */
2360                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2361                   = (reg_bits & 1
2362                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2363               else
2364                 /* We anyway have to designate CHARSET to somewhere.  */
2365                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2366                   = (CHARSET_CHARS (charset) == 94
2367                      ? 0
2368                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2369                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2370                         ? 1
2371                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2372                            ? 2 : 0)));
2373             }
2374       }
2375       coding->require_flushing = 1;
2376       break;
2377
2378     case 3:
2379       coding->type = coding_type_big5;
2380       coding->flags
2381         = (NILP (XVECTOR (coding_system)->contents[4])
2382            ? CODING_FLAG_BIG5_HKU
2383            : CODING_FLAG_BIG5_ETEN);
2384       break;
2385
2386     case 4:
2387       coding->type = coding_type_ccl;
2388       {
2389         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2390         if (CONSP  (val)
2391             && VECTORP (XCONS (val)->car)
2392             && VECTORP (XCONS (val)->cdr))
2393           {
2394             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2395             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2396           }
2397         else
2398           goto label_invalid_coding_system;
2399       }
2400       coding->require_flushing = 1;
2401       break;
2402
2403     default:
2404       if (EQ (type, Qt))
2405         coding->type = coding_type_undecided;
2406       else
2407         coding->type = coding_type_no_conversion;
2408       break;
2409     }
2410   return 0;
2411
2412  label_invalid_coding_system:
2413   coding->type = coding_type_no_conversion;
2414   coding->eol_type = CODING_EOL_LF;
2415   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2416     = Qnil;
2417   return -1;
2418 }
2419
2420 /* Emacs has a mechanism to automatically detect a coding system if it
2421    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2422    it's impossible to distinguish some coding systems accurately
2423    because they use the same range of codes.  So, at first, coding
2424    systems are categorized into 8, those are:
2425
2426    o coding-category-emacs-mule
2427
2428         The category for a coding system which has the same code range
2429         as Emacs' internal format.  Assigned the coding-system (Lisp
2430         symbol) `emacs-mule' by default.
2431
2432    o coding-category-sjis
2433
2434         The category for a coding system which has the same code range
2435         as SJIS.  Assigned the coding-system (Lisp
2436         symbol) `shift-jis' by default.
2437
2438    o coding-category-iso-7
2439
2440         The category for a coding system which has the same code range
2441         as ISO2022 of 7-bit environment.  Assigned the coding-system
2442         (Lisp symbol) `iso-2022-7' by default.
2443
2444    o coding-category-iso-8-1
2445
2446         The category for a coding system which has the same code range
2447         as ISO2022 of 8-bit environment and graphic plane 1 used only
2448         for DIMENSION1 charset.  Assigned the coding-system (Lisp
2449         symbol) `iso-8859-1' by default.
2450
2451    o coding-category-iso-8-2
2452
2453         The category for a coding system which has the same code range
2454         as ISO2022 of 8-bit environment and graphic plane 1 used only
2455         for DIMENSION2 charset.  Assigned the coding-system (Lisp
2456         symbol) `euc-japan' by default.
2457
2458    o coding-category-iso-else
2459
2460         The category for a coding system which has the same code range
2461         as ISO2022 but does not belong to any of the above three
2462         categories.  Assigned the coding-system (Lisp symbol)
2463         `iso-2022-ss2-7' by default.
2464
2465    o coding-category-big5
2466
2467         The category for a coding system which has the same code range
2468         as BIG5.  Assigned the coding-system (Lisp symbol)
2469         `cn-big5' by default.
2470
2471    o coding-category-binary
2472
2473         The category for a coding system not categorized in any of the
2474         above.  Assigned the coding-system (Lisp symbol)
2475         `no-conversion' by default.
2476
2477    Each of them is a Lisp symbol and the value is an actual
2478    `coding-system's (this is also a Lisp symbol) assigned by a user.
2479    What Emacs does actually is to detect a category of coding system.
2480    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2481    decide only one possible category, it selects a category of the
2482    highest priority.  Priorities of categories are also specified by a
2483    user in a Lisp variable `coding-category-list'.
2484
2485 */
2486
2487 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2488    If it detects possible coding systems, return an integer in which
2489    appropriate flag bits are set.  Flag bits are defined by macros
2490    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2491
2492 int
2493 detect_coding_mask (src, src_bytes)
2494      unsigned char *src;
2495      int src_bytes;
2496 {
2497   register unsigned char c;
2498   unsigned char *src_end = src + src_bytes;
2499   int mask;
2500
2501   /* At first, skip all ASCII characters and control characters except
2502      for three ISO2022 specific control characters.  */
2503  label_loop_detect_coding:
2504   while (src < src_end)
2505     {
2506       c = *src;
2507       if (c >= 0x80
2508           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2509         break;
2510       src++;
2511     }
2512
2513   if (src >= src_end)
2514     /* We found nothing other than ASCII.  There's nothing to do.  */
2515     return CODING_CATEGORY_MASK_ANY;
2516
2517   /* The text seems to be encoded in some multilingual coding system.
2518      Now, try to find in which coding system the text is encoded.  */
2519   if (c < 0x80)
2520     {
2521       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2522       /* C is an ISO2022 specific control code of C0.  */
2523       mask = detect_coding_iso2022 (src, src_end);
2524       src++;
2525       if (mask == CODING_CATEGORY_MASK_ANY)
2526         /* No valid ISO2022 code follows C.  Try again.  */
2527         goto label_loop_detect_coding;
2528     }
2529   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2530     /* C is an ISO2022 specific control code of C1,
2531        or the first byte of SJIS's 2-byte character code,
2532        or a leading code of Emacs.  */
2533     mask = (detect_coding_iso2022 (src, src_end)
2534             | detect_coding_sjis (src, src_end)
2535             | detect_coding_emacs_mule (src, src_end));
2536
2537   else if (c < 0xA0)
2538     /* C is the first byte of SJIS character code,
2539        or a leading-code of Emacs.  */
2540     mask = (detect_coding_sjis (src, src_end)
2541             | detect_coding_emacs_mule (src, src_end));
2542
2543   else
2544     /* C is a character of ISO2022 in graphic plane right,
2545        or a SJIS's 1-byte character code (i.e. JISX0201),
2546        or the first byte of BIG5's 2-byte code.  */
2547     mask = (detect_coding_iso2022 (src, src_end)
2548             | detect_coding_sjis (src, src_end)
2549             | detect_coding_big5 (src, src_end));
2550
2551   return mask;
2552 }
2553
2554 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2555    The information of the detected coding system is set in CODING.  */
2556
2557 void
2558 detect_coding (coding, src, src_bytes)
2559      struct coding_system *coding;
2560      unsigned char *src;
2561      int src_bytes;
2562 {
2563   int mask = detect_coding_mask (src, src_bytes);
2564   int idx;
2565
2566   if (mask == CODING_CATEGORY_MASK_ANY)
2567     /* We found nothing other than ASCII.  There's nothing to do.  */
2568     return;
2569
2570   if (!mask)
2571     /* The source text seems to be encoded in unknown coding system.
2572        Emacs regards the category of such a kind of coding system as
2573        `coding-category-binary'.  We assume that a user has assigned
2574        an appropriate coding system for a `coding-category-binary'.  */
2575     idx = CODING_CATEGORY_IDX_BINARY;
2576   else
2577     {
2578       /* We found some plausible coding systems.  Let's use a coding
2579          system of the highest priority.  */
2580       Lisp_Object val = Vcoding_category_list;
2581
2582       if (CONSP (val))
2583         while (!NILP (val))
2584           {
2585             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2586             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2587               break;
2588             val = XCONS (val)->cdr;
2589           }
2590       else
2591         val = Qnil;
2592
2593       if (NILP (val))
2594         {
2595           /* For unknown reason, `Vcoding_category_list' contains none
2596              of found categories.  Let's use any of them.  */
2597           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2598             if (mask & (1 << idx))
2599               break;
2600         }
2601     }
2602   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2603 }
2604
2605 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2606    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2607    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2608
2609 int
2610 detect_eol_type (src, src_bytes)
2611      unsigned char *src;
2612      int src_bytes;
2613 {
2614   unsigned char *src_end = src + src_bytes;
2615   unsigned char c;
2616
2617   while (src < src_end)
2618     {
2619       c = *src++;
2620       if (c == '\n')
2621         return CODING_EOL_LF;
2622       else if (c == '\r')
2623         {
2624           if (src < src_end && *src == '\n')
2625             return CODING_EOL_CRLF;
2626           else
2627             return CODING_EOL_CR;
2628         }
2629     }
2630   return CODING_EOL_UNDECIDED;
2631 }
2632
2633 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2634    is encoded.  If it detects an appropriate format of end-of-line, it
2635    sets the information in *CODING.  */
2636
2637 void
2638 detect_eol (coding, src, src_bytes)
2639      struct coding_system *coding;
2640      unsigned char *src;
2641      int src_bytes;
2642 {
2643   Lisp_Object val;
2644   int eol_type = detect_eol_type (src, src_bytes);
2645
2646   if (eol_type == CODING_EOL_UNDECIDED)
2647     /*  We found no end-of-line in the source text.  */
2648     return;
2649
2650   val = Fget (coding->symbol, Qeol_type);
2651   if (VECTORP (val) && XVECTOR (val)->size == 3)
2652     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2653 }
2654
2655 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2656    decoding, it may detect coding system and format of end-of-line if
2657    those are not yet decided.  */
2658
2659 int
2660 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2661      struct coding_system *coding;
2662      unsigned char *source, *destination;
2663      int src_bytes, dst_bytes;
2664      int *consumed;
2665 {
2666   int produced;
2667
2668   if (src_bytes <= 0)
2669     {
2670       *consumed = 0;
2671       return 0;
2672     }
2673
2674   if (coding->type == coding_type_undecided)
2675     detect_coding (coding, source, src_bytes);
2676
2677   if (coding->eol_type == CODING_EOL_UNDECIDED)
2678     detect_eol (coding, source, src_bytes);
2679
2680   coding->carryover_size = 0;
2681   switch (coding->type)
2682     {
2683     case coding_type_no_conversion:
2684     label_no_conversion:
2685       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2686       bcopy (source, destination, produced);
2687       *consumed = produced;
2688       break;
2689
2690     case coding_type_emacs_mule:
2691     case coding_type_undecided:
2692       if (coding->eol_type == CODING_EOL_LF
2693           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2694         goto label_no_conversion;
2695       produced = decode_eol (coding, source, destination,
2696                              src_bytes, dst_bytes, consumed);
2697       break;
2698
2699     case coding_type_sjis:
2700       produced = decode_coding_sjis_big5 (coding, source, destination,
2701                                           src_bytes, dst_bytes, consumed,
2702                                           1);
2703       break;
2704
2705     case coding_type_iso2022:
2706       produced = decode_coding_iso2022 (coding, source, destination,
2707                                         src_bytes, dst_bytes, consumed);
2708       break;
2709
2710     case coding_type_big5:
2711       produced = decode_coding_sjis_big5 (coding, source, destination,
2712                                           src_bytes, dst_bytes, consumed,
2713                                           0);
2714       break;
2715
2716     case coding_type_ccl:
2717       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2718                              src_bytes, dst_bytes, consumed);
2719       break;
2720     }
2721
2722   return produced;
2723 }
2724
2725 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2726
2727 int
2728 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2729      struct coding_system *coding;
2730      unsigned char *source, *destination;
2731      int src_bytes, dst_bytes;
2732      int *consumed;
2733 {
2734   int produced;
2735
2736   coding->carryover_size = 0;
2737   switch (coding->type)
2738     {
2739     case coding_type_no_conversion:
2740     label_no_conversion:
2741       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2742       if (produced > 0)
2743         {
2744           bcopy (source, destination, produced);
2745           if (coding->selective)
2746             {
2747               unsigned char *p = destination, *pend = destination + produced;
2748               while (p < pend)
2749                 if (*p++ == '\015') p[-1] = '\n';
2750             }
2751         }
2752       *consumed = produced;
2753       break;
2754
2755     case coding_type_emacs_mule:
2756     case coding_type_undecided:
2757       if (coding->eol_type == CODING_EOL_LF
2758           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2759         goto label_no_conversion;
2760       produced = encode_eol (coding, source, destination,
2761                              src_bytes, dst_bytes, consumed);
2762       break;
2763
2764     case coding_type_sjis:
2765       produced = encode_coding_sjis_big5 (coding, source, destination,
2766                                           src_bytes, dst_bytes, consumed,
2767                                           1);
2768       break;
2769
2770     case coding_type_iso2022:
2771       produced = encode_coding_iso2022 (coding, source, destination,
2772                                         src_bytes, dst_bytes, consumed);
2773       break;
2774
2775     case coding_type_big5:
2776       produced = encode_coding_sjis_big5 (coding, source, destination,
2777                                           src_bytes, dst_bytes, consumed,
2778                                           0);
2779       break;
2780
2781     case coding_type_ccl:
2782       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2783                              src_bytes, dst_bytes, consumed);
2784       break;
2785     }
2786
2787   return produced;
2788 }
2789
2790 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2791
2792 /* Return maximum size (bytes) of a buffer enough for decoding
2793    SRC_BYTES of text encoded in CODING.  */
2794
2795 int
2796 decoding_buffer_size (coding, src_bytes)
2797      struct coding_system *coding;
2798      int src_bytes;
2799 {
2800   int magnification;
2801
2802   if (coding->type == coding_type_iso2022)
2803     magnification = 3;
2804   else if (coding->type == coding_type_ccl)
2805     magnification = coding->spec.ccl.decoder.buf_magnification;
2806   else
2807     magnification = 2;
2808
2809   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2810 }
2811
2812 /* Return maximum size (bytes) of a buffer enough for encoding
2813    SRC_BYTES of text to CODING.  */
2814
2815 int
2816 encoding_buffer_size (coding, src_bytes)
2817      struct coding_system *coding;
2818      int src_bytes;
2819 {
2820   int magnification;
2821
2822   if (coding->type == coding_type_ccl)
2823     magnification = coding->spec.ccl.encoder.buf_magnification;
2824   else
2825     magnification = 3;
2826
2827   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2828 }
2829
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is
   already large enough; otherwise it is replaced by a larger one (its
   old contents are NOT preserved).
   NOTE(review): allocation failure is left entirely to xmalloc; this
   function itself never checks for or returns NULL -- confirm against
   xmalloc's contract.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never grows, so start from the minimum size;
         otherwise the loop below would never terminate when the
         buffer has not been allocated yet.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2858
2859 \f
2860 #ifdef emacs
2861 /*** 7. Emacs Lisp library functions ***/
2862
2863 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2864        1, 1, 0,
2865   "Return coding-spec of CODING-SYSTEM.\n\
2866 If CODING-SYSTEM is not a valid coding-system, return nil.")
2867   (obj)
2868      Lisp_Object obj;
2869 {
2870   while (SYMBOLP (obj) && !NILP (obj))
2871     obj = Fget (obj, Qcoding_system);
2872   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2873           ? Qnil : obj);
2874 }
2875
2876 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2877   "Return t if OBJECT is nil or a coding-system.\n\
2878 See document of make-coding-system for coding-system object.")
2879   (obj)
2880      Lisp_Object obj;
2881 {
2882   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2883 }
2884
2885 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2886        Sread_non_nil_coding_system, 1, 1, 0,
2887   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2888   (prompt)
2889      Lisp_Object prompt;
2890 {
2891   Lisp_Object val;
2892   do
2893     {
2894       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2895                               Qt, Qnil, Qnil, Qnil);
2896     }
2897   while (XSTRING (val)->size == 0);
2898   return (Fintern (val, Qnil));
2899 }
2900
2901 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2902   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2903   (prompt)
2904      Lisp_Object prompt;
2905 {
2906   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2907                                       Qt, Qnil, Qnil, Qnil);
2908   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2909 }
2910
2911 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2912        1, 1, 0,
2913   "Check validity of CODING-SYSTEM.\n\
2914 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2915 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2916 The value of property should be a vector of length 5.")
2917   (coding_system)
2918      Lisp_Object coding_system;
2919 {
2920   CHECK_SYMBOL (coding_system, 0);
2921   if (!NILP (Fcoding_system_p (coding_system)))
2922     return coding_system;
2923   while (1)
2924     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2925 }
2926
2927 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2928        2, 2, 0,
2929   "Detect coding-system of the text in the region between START and END.\n\
2930 Return a list of possible coding-systems ordered by priority.\n\
2931 If only ASCII characters are found, it returns `undecided'\n\
2932  or its subsidiary coding-system according to a detected end-of-line format.")
2933   (b, e)
2934      Lisp_Object b, e;
2935 {
2936   int coding_mask, eol_type;
2937   Lisp_Object val;
2938   int beg, end;
2939
2940   validate_region (&b, &e);
2941   beg = XINT (b), end = XINT (e);
2942   if (beg < GPT && end >= GPT) move_gap (end);
2943
2944   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2945   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2946
2947   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2948     {
2949       val = intern ("undecided");
2950       if (eol_type != CODING_EOL_UNDECIDED)
2951         {
2952           Lisp_Object val2 = Fget (val, Qeol_type);
2953           if (VECTORP (val2))
2954             val = XVECTOR (val2)->contents[eol_type];
2955         }
2956     }
2957   else
2958     {
2959       Lisp_Object val2;
2960
2961       /* At first, gather possible coding-systems in VAL in a reverse
2962          order.  */
2963       val = Qnil;
2964       for (val2 = Vcoding_category_list;
2965            !NILP (val2);
2966            val2 = XCONS (val2)->cdr)
2967         {
2968           int idx
2969             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2970           if (coding_mask & (1 << idx))
2971             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2972         }
2973
2974       /* Then, change the order of the list, while getting subsidiary
2975          coding-systems.  */
2976       val2 = val;
2977       val = Qnil;
2978       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2979         {
2980           if (eol_type == CODING_EOL_UNDECIDED)
2981             val = Fcons (XCONS (val2)->car, val);
2982           else
2983             {
2984               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2985               if (VECTORP (val3))
2986                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2987               else
2988                 val = Fcons (XCONS (val2)->car, val);
2989             }
2990         }
2991     }
2992
2993   return val;
2994 }
2995
2996 /* Scan text in the region between *BEGP and *ENDP, skip characters
2997    which we never have to encode to (iff ENCODEP is 1) or decode from
2998    coding system CODING at the head and tail, then set BEGP and ENDP
2999    to the addresses of start and end of the text we actually convert.  */
3000
3001 void
3002 shrink_conversion_area (begp, endp, coding, encodep)
3003      unsigned char **begp, **endp;
3004      struct coding_system *coding;
3005      int encodep;
3006 {
3007   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3008
3009   if (coding->eol_type != CODING_EOL_LF
3010       && coding->eol_type != CODING_EOL_UNDECIDED)
3011     /* Since we anyway have to convert end-of-line format, it is not
3012        worth skipping at most 100 bytes or so.  */
3013     return;
3014
3015   if (encodep)                  /* for encoding */
3016     {
3017       switch (coding->type)
3018         {
3019         case coding_type_no_conversion:
3020         case coding_type_emacs_mule:
3021         case coding_type_undecided:
3022           /* We need no conversion.  */
3023           *begp = *endp;
3024           return;
3025         case coding_type_ccl:
3026           /* We can't skip any data.  */
3027           return;
3028         case coding_type_iso2022:
3029           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3030             {
3031               unsigned char *bol = beg_addr;
3032               while (beg_addr < end_addr && *beg_addr < 0x80)
3033                 {
3034                   beg_addr++;
3035                   if (*(beg_addr - 1) == '\n')
3036                     bol = beg_addr;
3037                 }
3038               beg_addr = bol;
3039               goto label_skip_tail;
3040             }
3041           /* fall down ... */
3042         default:
3043           /* We can skip all ASCII characters at the head and tail.  */
3044           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3045         label_skip_tail:
3046           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3047           break;
3048         }
3049     }
3050   else                          /* for decoding */
3051     {
3052       switch (coding->type)
3053         {
3054         case coding_type_no_conversion:
3055           /* We need no conversion.  */
3056           *begp = *endp;
3057           return;
3058         case coding_type_emacs_mule:
3059           if (coding->eol_type == CODING_EOL_LF)
3060             {
3061               /* We need no conversion.  */
3062               *begp = *endp;
3063               return;
3064             }
3065           /* We can skip all but carriage-return.  */
3066           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3067           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3068           break;
3069         case coding_type_sjis:
3070         case coding_type_big5:
3071           /* We can skip all ASCII characters at the head.  */
3072           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3073           /* We can skip all ASCII characters at the tail except for
3074              the second byte of SJIS or BIG5 code.  */
3075           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3076           if (end_addr != *endp)
3077             end_addr++;
3078           break;
3079         case coding_type_ccl:
3080           /* We can't skip any data.  */
3081           return;
3082         default:                /* i.e. case coding_type_iso2022: */
3083           {
3084             unsigned char c;
3085
3086             /* We can skip all ASCII characters except for a few
3087                control codes at the head.  */
3088             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3089                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3090                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3091               beg_addr++;
3092           }
3093           break;
3094         }
3095     }
3096   *begp = beg_addr;
3097   *endp = end_addr;
3098   return;
3099 }
3100
3101 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3102    text between B and E.  B and E are buffer position.  */
3103
3104 Lisp_Object
3105 code_convert_region (b, e, coding, encodep)
3106      Lisp_Object b, e;
3107      struct coding_system *coding;
3108      int encodep;
3109 {
3110   int beg, end, len, consumed, produced;
3111   char *buf;
3112   unsigned char *begp, *endp;
3113   int pos = PT;
3114
3115   validate_region (&b, &e);
3116   beg = XINT (b), end = XINT (e);
3117   if (beg < GPT && end >= GPT)
3118     move_gap (end);
3119
3120   if (encodep && !NILP (coding->pre_write_conversion))
3121     {
3122       /* We must call a pre-conversion function which may put a new
3123          text to be converted in a new buffer.  */
3124       struct buffer *old = current_buffer, *new;
3125
3126       TEMP_SET_PT (beg);
3127       call2 (coding->pre_write_conversion, b, e);
3128       if (old != current_buffer)
3129         {
3130           /* Replace the original text by the text just generated.  */
3131           len = ZV - BEGV;
3132           new = current_buffer;
3133           set_buffer_internal (old);
3134           del_range (beg, end);
3135           insert_from_buffer (new, 1, len, 0);
3136           end = beg + len;
3137         }
3138     }
3139
3140   /* We may be able to shrink the conversion region.  */
3141   begp = POS_ADDR (beg); endp = begp + (end - beg);
3142   shrink_conversion_area (&begp, &endp, coding, encodep);
3143
3144   if (begp == endp)
3145     /* We need no conversion.  */
3146     len = end - beg;
3147   else
3148     {
3149       beg += begp - POS_ADDR (beg);
3150       end =  beg + (endp - begp);
3151
3152       if (encodep)
3153         len = encoding_buffer_size (coding, end - beg);
3154       else
3155         len = decoding_buffer_size (coding, end - beg);
3156       buf = get_conversion_buffer (len);
3157
3158       coding->last_block = 1;
3159       produced = (encodep
3160                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3161                                    &consumed)
3162                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3163                                    &consumed));
3164
3165       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3166
3167       TEMP_SET_PT (beg);
3168       insert (buf, produced);
3169       del_range (PT, PT + end - beg);
3170       if (pos >= end)
3171         pos = PT + (pos - end);
3172       else if (pos > beg)
3173         pos = beg;
3174       TEMP_SET_PT (pos);
3175   }
3176
3177   if (!encodep && !NILP (coding->post_read_conversion))
3178     {
3179       /* We must call a post-conversion function which may alter
3180          the text just converted.  */
3181       Lisp_Object insval;
3182
3183       beg = XINT (b);
3184       TEMP_SET_PT (beg);
3185       insval = call1 (coding->post_read_conversion, make_number (len));
3186       CHECK_NUMBER (insval, 0);
3187       len = XINT (insval);
3188     }
3189
3190   return make_number (len);
3191 }
3192
3193 Lisp_Object
3194 code_convert_string (str, coding, encodep, nocopy)
3195      Lisp_Object str, nocopy;
3196      struct coding_system *coding;
3197      int encodep;
3198 {
3199   int len, consumed, produced;
3200   char *buf;
3201   unsigned char *begp, *endp;
3202   int head_skip, tail_skip;
3203   struct gcpro gcpro1;
3204
3205   if (encodep && !NILP (coding->pre_write_conversion)
3206       || !encodep && !NILP (coding->post_read_conversion))
3207     {
3208       /* Since we have to call Lisp functions which assume target text
3209          is in a buffer, after setting a temporary buffer, call
3210          code_convert_region.  */
3211       int count = specpdl_ptr - specpdl;
3212       int len = XSTRING (str)->size;
3213       Lisp_Object result;
3214       struct buffer *old = current_buffer;
3215
3216       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3217       temp_output_buffer_setup (" *code-converting-work*");
3218       set_buffer_internal (XBUFFER (Vstandard_output));
3219       insert_from_string (str, 0, len, 0);
3220       code_convert_region (make_number (BEGV), make_number (ZV),
3221                            coding, encodep);
3222       result = make_buffer_string (BEGV, ZV, 0);
3223       set_buffer_internal (old);
3224       return unbind_to (count, result);
3225     }
3226
3227   /* We may be able to shrink the conversion region.  */
3228   begp = XSTRING (str)->data;
3229   endp = begp + XSTRING (str)->size;
3230   shrink_conversion_area (&begp, &endp, coding, encodep);
3231
3232   if (begp == endp)
3233     /* We need no conversion.  */
3234     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3235
3236   head_skip = begp - XSTRING (str)->data;
3237   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3238
3239   GCPRO1 (str);
3240
3241   if (encodep)
3242     len = encoding_buffer_size (coding, endp - begp);
3243   else
3244     len = decoding_buffer_size (coding, endp - begp);
3245   buf = get_conversion_buffer (len + head_skip + tail_skip);
3246
3247   bcopy (XSTRING (str)->data, buf, head_skip);
3248   coding->last_block = 1;
3249   produced = (encodep
3250               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3251                                buf + head_skip, endp - begp, len, &consumed)
3252               : decode_coding (coding, XSTRING (str)->data + head_skip,
3253                                buf + head_skip, endp - begp, len, &consumed));
3254   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3255          buf + head_skip + produced,
3256          tail_skip);
3257
3258   UNGCPRO;
3259
3260   return make_string (buf, head_skip + produced + tail_skip);
3261 }
3262
3263 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3264        3, 3, "r\nzCoding system: ",
3265   "Decode current region by specified coding system.\n\
3266 When called from a program, takes three arguments:\n\
3267 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3268 Return length of decoded text.")
3269   (b, e, coding_system)
3270      Lisp_Object b, e, coding_system;
3271 {
3272   struct coding_system coding;
3273
3274   CHECK_NUMBER_COERCE_MARKER (b, 0);
3275   CHECK_NUMBER_COERCE_MARKER (e, 1);
3276   CHECK_SYMBOL (coding_system, 2);
3277
3278   if (NILP (coding_system))
3279     return make_number (XFASTINT (e) - XFASTINT (b));
3280   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3281     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3282
3283   return code_convert_region (b, e, &coding, 0);
3284 }
3285
3286 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3287        3, 3, "r\nzCoding system: ",
3288   "Encode current region by specified coding system.\n\
3289 When called from a program, takes three arguments:\n\
3290 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3291 Return length of encoded text.")
3292   (b, e, coding_system)
3293      Lisp_Object b, e, coding_system;
3294 {
3295   struct coding_system coding;
3296
3297   CHECK_NUMBER_COERCE_MARKER (b, 0);
3298   CHECK_NUMBER_COERCE_MARKER (e, 1);
3299   CHECK_SYMBOL (coding_system, 2);
3300
3301   if (NILP (coding_system))
3302     return make_number (XFASTINT (e) - XFASTINT (b));
3303   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3304     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3305
3306   return code_convert_region (b, e, &coding, 1);
3307 }
3308
3309 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3310        2, 3, 0,
3311   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3312 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3313 of decoding.")
3314   (string, coding_system, nocopy)
3315      Lisp_Object string, coding_system, nocopy;
3316 {
3317   struct coding_system coding;
3318
3319   CHECK_STRING (string, 0);
3320   CHECK_SYMBOL (coding_system, 1);
3321
3322   if (NILP (coding_system))
3323     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3324   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3325     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3326
3327   return code_convert_string (string, &coding, 0, nocopy);
3328 }
3329
3330 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3331        2, 3, 0,
3332   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3333 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3334 of encoding.")
3335   (string, coding_system, nocopy)
3336      Lisp_Object string, coding_system, nocopy;
3337 {
3338   struct coding_system coding;
3339
3340   CHECK_STRING (string, 0);
3341   CHECK_SYMBOL (coding_system, 1);
3342
3343   if (NILP (coding_system))
3344     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3345   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3346     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3347
3348   return code_convert_string (string, &coding, 1, nocopy);
3349 }
3350
3351 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3352   "Decode a JISX0208 character of shift-jis encoding.\n\
3353 CODE is the character code in SJIS.\n\
3354 Return the corresponding character.")
3355   (code)
3356      Lisp_Object code;
3357 {
3358   unsigned char c1, c2, s1, s2;
3359   Lisp_Object val;
3360
3361   CHECK_NUMBER (code, 0);
3362   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3363   DECODE_SJIS (s1, s2, c1, c2);
3364   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3365   return val;
3366 }
3367
3368 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3369   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3370 Return the corresponding character code in SJIS.")
3371   (ch)
3372      Lisp_Object ch;
3373 {
3374   int charset, c1, c2, s1, s2;
3375   Lisp_Object val;
3376
3377   CHECK_NUMBER (ch, 0);
3378   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3379   if (charset == charset_jisx0208)
3380     {
3381       ENCODE_SJIS (c1, c2, s1, s2);
3382       XSETFASTINT (val, (s1 << 8) | s2);
3383     }
3384   else
3385     XSETFASTINT (val, 0);
3386   return val;
3387 }
3388
3389 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3390   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3391 CODE is the character code in BIG5.\n\
3392 Return the corresponding character.")
3393   (code)
3394      Lisp_Object code;
3395 {
3396   int charset;
3397   unsigned char b1, b2, c1, c2;
3398   Lisp_Object val;
3399
3400   CHECK_NUMBER (code, 0);
3401   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3402   DECODE_BIG5 (b1, b2, charset, c1, c2);
3403   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3404   return val;
3405 }
3406
3407 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3408   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3409 Return the corresponding character code in Big5.")
3410   (ch)
3411      Lisp_Object ch;
3412 {
3413   int charset, c1, c2, b1, b2;
3414   Lisp_Object val;
3415
3416   CHECK_NUMBER (ch, 0);
3417   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3418   if (charset == charset_big5_1 || charset == charset_big5_2)
3419     {
3420       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3421       XSETFASTINT (val, (b1 << 8) | b2);
3422     }
3423   else
3424     XSETFASTINT (val, 0);
3425   return val;
3426 }
3427
3428 DEFUN ("set-terminal-coding-system-internal",
3429        Fset_terminal_coding_system_internal,
3430        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3431   (coding_system)
3432      Lisp_Object coding_system;
3433 {
3434   CHECK_SYMBOL (coding_system, 0);
3435   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3436   return Qnil;
3437 }
3438
3439 DEFUN ("terminal-coding-system",
3440        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3441   "Return coding-system of your terminal.")
3442   ()
3443 {
3444   return terminal_coding.symbol;
3445 }
3446
3447 DEFUN ("set-keyboard-coding-system-internal",
3448        Fset_keyboard_coding_system_internal,
3449        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3450   (coding_system)
3451      Lisp_Object coding_system;
3452 {
3453   CHECK_SYMBOL (coding_system, 0);
3454   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3455   return Qnil;
3456 }
3457
3458 DEFUN ("keyboard-coding-system",
3459        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3460   "Return coding-system of what is sent from terminal keyboard.")
3461   ()
3462 {
3463   return keyboard_coding.symbol;
3464 }
3465
3466 \f
3467 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3468        Sfind_operation_coding_system,  1, MANY, 0,
3469   "Choose a coding system for an operation based on the target name.\n\
3470 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3471 ENCODING-SYSTEM is the coding system to use for encoding\n\
3472 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3473 for decoding (in case OPERATION does decoding).\n\
3474 \n\
3475 The first argument OPERATION specifies an I/O primitive:\n\
3476   For file I/O, `insert-file-contents' or `write-region'.\n\
3477   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3478   For network I/O, `open-network-stream'.\n\
3479 \n\
3480 The remaining arguments should be the same arguments that were passed\n\
3481 to the primitive.  Depending on which primitive, one of those arguments\n\
3482 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3483 whichever argument specifies the file name is TARGET.\n\
3484 \n\
3485 TARGET has a meaning which depends on OPERATION:\n\
3486   For file I/O, TARGET is a file name.\n\
3487   For process I/O, TARGET is a process name.\n\
3488   For network I/O, TARGET is a service name or a port number\n\
3489 \n\
3490 This function looks up what specified for TARGET in,\n\
3491 `file-coding-system-alist', `process-coding-system-alist',\n\
3492 or `network-coding-system-alist' depending on OPERATION.\n\
3493 They may specify a coding system, a cons of coding systems,\n\
3494 or a function symbol to call.\n\
3495 In the last case, we call the function with one argument,\n\
3496 which is a list of all the arguments given to `find-coding-system'.")
3497   (nargs, args)
3498      int nargs;
3499      Lisp_Object *args;
3500 {
3501   Lisp_Object operation, target_idx, target, val;
3502   register Lisp_Object chain;
3503
3504   if (nargs < 2)
3505     error ("Too few arguments");
3506   operation = args[0];
3507   if (!SYMBOLP (operation)
3508       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3509     error ("Invalid first arguement");
3510   if (nargs < 1 + XINT (target_idx))
3511     error ("Too few arguments for operation: %s",
3512            XSYMBOL (operation)->name->data);
3513   target = args[XINT (target_idx) + 1];
3514   if (!(STRINGP (target)
3515         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3516     error ("Invalid %dth argument", XINT (target_idx) + 1);
3517
3518   chain = (operation == Qinsert_file_contents || operation == Qwrite_region
3519            ? Vfile_coding_system_alist
3520            : (operation == Qopen_network_stream
3521               ? Vnetwork_coding_system_alist
3522               : Vprocess_coding_system_alist));
3523   if (NILP (chain))
3524     return Qnil;
3525
3526   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3527     {
3528       Lisp_Object elt = XCONS (chain)->car;
3529
3530       if (CONSP (elt)
3531           && ((STRINGP (target)
3532                && STRINGP (XCONS (elt)->car)
3533                && fast_string_match (XCONS (elt)->car, target) >= 0)
3534               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3535         {
3536           val = XCONS (elt)->cdr;
3537           if (CONSP (val))
3538             return val;
3539           if (! SYMBOLP (val))
3540             return Qnil;
3541           if (! NILP (Fcoding_system_p (val)))
3542             return Fcons (val, val);
3543           if (!NILP (Fboundp (val)))
3544             return call2 (val, Flist (nargs, args));
3545           return Qnil;
3546         }
3547     }
3548   return Qnil;
3549 }
3550
3551 #endif /* emacs */
3552
3553 \f
3554 /*** 8. Post-amble ***/
3555
3556 init_coding_once ()
3557 {
3558   int i;
3559
3560   /* Emacs' internal format specific initialize routine.  */
3561   for (i = 0; i <= 0x20; i++)
3562     emacs_code_class[i] = EMACS_control_code;
3563   emacs_code_class[0x0A] = EMACS_linefeed_code;
3564   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3565   for (i = 0x21 ; i < 0x7F; i++)
3566     emacs_code_class[i] = EMACS_ascii_code;
3567   emacs_code_class[0x7F] = EMACS_control_code;
3568   emacs_code_class[0x80] = EMACS_leading_code_composition;
3569   for (i = 0x81; i < 0xFF; i++)
3570     emacs_code_class[i] = EMACS_invalid_code;
3571   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3572   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3573   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3574   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3575
3576   /* ISO2022 specific initialize routine.  */
3577   for (i = 0; i < 0x20; i++)
3578     iso_code_class[i] = ISO_control_code;
3579   for (i = 0x21; i < 0x7F; i++)
3580     iso_code_class[i] = ISO_graphic_plane_0;
3581   for (i = 0x80; i < 0xA0; i++)
3582     iso_code_class[i] = ISO_control_code;
3583   for (i = 0xA1; i < 0xFF; i++)
3584     iso_code_class[i] = ISO_graphic_plane_1;
3585   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3586   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3587   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3588   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3589   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3590   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3591   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3592   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3593   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3594   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3595
3596   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3597   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3598
3599   setup_coding_system (Qnil, &keyboard_coding);
3600   setup_coding_system (Qnil, &terminal_coding);
3601 }
3602
3603 #ifdef emacs
3604
3605 syms_of_coding ()
3606 {
3607   Qtarget_idx = intern ("target-idx");
3608   staticpro (&Qtarget_idx);
3609
3610   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3611   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3612
3613   Qcall_process = intern ("call-process");
3614   staticpro (&Qcall_process);
3615   Fput (Qcall_process, Qtarget_idx, make_number (0));
3616
3617   Qcall_process_region = intern ("call-process-region");
3618   staticpro (&Qcall_process_region);
3619   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3620
3621   Qstart_process = intern ("start-process");
3622   staticpro (&Qstart_process);
3623   Fput (Qstart_process, Qtarget_idx, make_number (2));
3624
3625   Qopen_network_stream = intern ("open-network-stream");
3626   staticpro (&Qopen_network_stream);
3627   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3628
3629   Qcoding_system = intern ("coding-system");
3630   staticpro (&Qcoding_system);
3631
3632   Qeol_type = intern ("eol-type");
3633   staticpro (&Qeol_type);
3634
3635   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3636   staticpro (&Qbuffer_file_coding_system);
3637
3638   Qpost_read_conversion = intern ("post-read-conversion");
3639   staticpro (&Qpost_read_conversion);
3640
3641   Qpre_write_conversion = intern ("pre-write-conversion");
3642   staticpro (&Qpre_write_conversion);
3643
3644   Qcoding_system_spec = intern ("coding-system-spec");
3645   staticpro (&Qcoding_system_spec);
3646
3647   Qcoding_system_p = intern ("coding-system-p");
3648   staticpro (&Qcoding_system_p);
3649
3650   Qcoding_system_error = intern ("coding-system-error");
3651   staticpro (&Qcoding_system_error);
3652
3653   Fput (Qcoding_system_error, Qerror_conditions,
3654         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3655   Fput (Qcoding_system_error, Qerror_message,
3656         build_string ("Coding-system error"));
3657
3658   Qcoding_category_index = intern ("coding-category-index");
3659   staticpro (&Qcoding_category_index);
3660
3661   {
3662     int i;
3663     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3664       {
3665         coding_category_table[i] = intern (coding_category_name[i]);
3666         staticpro (&coding_category_table[i]);
3667         Fput (coding_category_table[i], Qcoding_category_index,
3668               make_number (i));
3669       }
3670   }
3671
3672   Qcharacter_unification_table = intern ("character-unification-table");
3673   staticpro (&Qcharacter_unification_table);
3674   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3675         make_number (0));
3676
3677   Qcharacter_unification_table_for_decode
3678     = intern ("character-unification-table-for-decode");
3679   staticpro (&Qcharacter_unification_table_for_decode);
3680
3681   Qcharacter_unification_table_for_encode
3682     = intern ("character-unification-table-for-encode");
3683   staticpro (&Qcharacter_unification_table_for_encode);
3684
3685   defsubr (&Scoding_system_spec);
3686   defsubr (&Scoding_system_p);
3687   defsubr (&Sread_coding_system);
3688   defsubr (&Sread_non_nil_coding_system);
3689   defsubr (&Scheck_coding_system);
3690   defsubr (&Sdetect_coding_region);
3691   defsubr (&Sdecode_coding_region);
3692   defsubr (&Sencode_coding_region);
3693   defsubr (&Sdecode_coding_string);
3694   defsubr (&Sencode_coding_string);
3695   defsubr (&Sdecode_sjis_char);
3696   defsubr (&Sencode_sjis_char);
3697   defsubr (&Sdecode_big5_char);
3698   defsubr (&Sencode_big5_char);
3699   defsubr (&Sset_terminal_coding_system_internal);
3700   defsubr (&Sterminal_coding_system);
3701   defsubr (&Sset_keyboard_coding_system_internal);
3702   defsubr (&Skeyboard_coding_system);
3703   defsubr (&Sfind_operation_coding_system);
3704
3705   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3706     "List of coding-categories (symbols) ordered by priority.");
3707   {
3708     int i;
3709
3710     Vcoding_category_list = Qnil;
3711     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3712       Vcoding_category_list
3713         = Fcons (coding_category_table[i], Vcoding_category_list);
3714   }
3715
3716   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3717     "A variable of internal use only.\n\
3718 If the value is a coding system, it is used for decoding on read operation.\n\
3719 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3720   Vcoding_system_for_read = Qnil;
3721
3722   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3723     "A variable of internal use only.\n\
3724 If the value is a coding system, it is used for encoding on write operation.\n\
3725 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3726   Vcoding_system_for_write = Qnil;
3727
3728   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3729     "Coding-system used in the latest file or process I/O.");
3730   Vlast_coding_system_used = Qnil;
3731
3732   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3733     "Alist to decide a coding system to use for a file I/O operation.\n\
3734 The format is ((PATTERN . VAL) ...),\n\
3735 where PATTERN is a regular expression matching a file name,\n\
3736 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3737 If VAL is a coding system, it is used for both decoding and encoding\n\
3738 the file contents.\n\
3739 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3740 and the cdr part is used for encoding.\n\
3741 If VAL is a function symbol, the function must return a coding system\n\
3742 or a cons of coding systems which are used as above.\n\
3743 \n\
3744 See also the function `find-coding-system'.");
3745   Vfile_coding_system_alist = Qnil;
3746
3747   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3748     "Alist to decide a coding system to use for a process I/O operation.\n\
3749 The format is ((PATTERN . VAL) ...),\n\
3750 where PATTERN is a regular expression matching a program name,\n\
3751 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3752 If VAL is a coding system, it is used for both decoding what received\n\
3753 from the program and encoding what sent to the program.\n\
3754 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3755 and the cdr part is used for encoding.\n\
3756 If VAL is a function symbol, the function must return a coding system\n\
3757 or a cons of coding systems which are used as above.\n\
3758 \n\
3759 See also the function `find-coding-system'.");
3760   Vprocess_coding_system_alist = Qnil;
3761
3762   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3763     "Alist to decide a coding system to use for a network I/O operation.\n\
3764 The format is ((PATTERN . VAL) ...),\n\
3765 where PATTERN is a regular expression matching a network service name\n\
3766 or is a port number to connect to,\n\
3767 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3768 If VAL is a coding system, it is used for both decoding what received\n\
3769 from the network stream and encoding what sent to the network stream.\n\
3770 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3771 and the cdr part is used for encoding.\n\
3772 If VAL is a function symbol, the function must return a coding system\n\
3773 or a cons of coding systems which are used as above.\n\
3774 \n\
3775 See also the function `find-coding-system'.");
3776   Vnetwork_coding_system_alist = Qnil;
3777
3778   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3779     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3780   eol_mnemonic_unix = ':';
3781
3782   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3783     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3784   eol_mnemonic_dos = '\\';
3785
3786   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3787     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3788   eol_mnemonic_mac = '/';
3789
3790   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3791     "Mnemonic character indicating end-of-line format is not yet decided.");
3792   eol_mnemonic_undecided = ':';
3793
3794   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3795     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3796   Venable_character_unification = Qt;
3797
3798   DEFVAR_LISP ("standard-character-unification-table-for-decode",
3799     &Vstandard_character_unification_table_for_decode,
3800     "Table for unifying characters when reading.");
3801   Vstandard_character_unification_table_for_decode = Qnil;
3802
3803   DEFVAR_LISP ("standard-character-unification-table-for-encode",
3804     &Vstandard_character_unification_table_for_encode,
3805     "Table for unifying characters when writing.");
3806   Vstandard_character_unification_table_for_encode = Qnil;
3807
3808   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3809     "Alist of charsets vs revision numbers.\n\
3810 While encoding, if a charset (car part of an element) is found,\n\
3811 designate it with the escape sequence identifing revision (cdr part of the element).");
3812   Vcharset_revision_alist = Qnil;
3813
3814   DEFVAR_LISP ("default-process-coding-system",
3815                &Vdefault_process_coding_system,
3816     "Cons of coding systems used for process I/O by default.\n\
3817 The car part is used for decoding a process output,\n\
3818 the cdr part is used for encoding a text to be sent to a process.");
3819   Vdefault_process_coding_system = Qnil;
3820 }
3821
3822 #endif /* emacs */