src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in the section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and such coding
  53   systems used in Internet communication as ISO-2022-JP are all
  54   variants of ISO2022.  Details are described in the section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   the section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in the section 4.  In this file, when written as "BIG5"
  67   (all uppercase), it means the coding system, and when written as
  68   "Big5" (capitalized), it means the character set.
  69
  70   4. Else
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See the section 6 for more
  81   detail.
  82
  83 */
  84
  85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  86
  87   How end-of-line of a text is encoded depends on a system.  For
  88   instance, Unix's format is just one byte of `line-feed' code,
  89   whereas DOS's format is two bytes sequence of `carriage-return' and
  90   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  91
  92   Since how characters in a text are encoded and how end-of-line is
  93   encoded are independent, any coding system described above can take
  94   any format of end-of-line.  So, Emacs has information of format of
  95   end-of-line in each coding-system.  See the section 6 for more
  96   detail.
  97
  98 */
  99
 100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 101
 102   These functions check if a text between SRC and SRC_END is encoded
 103   in the coding system category XXX.  Each returns an integer value in
 104   which appropriate flag bits for the category XXX are set.  The flag
 105   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 106   template of these functions.  */
 107 #if 0
 108 int
 109 detect_coding_emacs_mule (src, src_end)
 110      unsigned char *src, *src_end;
 111 {
 112   ...
 113 }
 114 #endif
 115
 116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 117
 118   These functions decode SRC_BYTES length text at SOURCE encoded in
 119   CODING to Emacs' internal format (emacs-mule).  The resulting text
 120   goes to a place pointed by DESTINATION, the length of which should
 121   not exceed DST_BYTES.  The number of bytes actually processed is
 122   returned as *CONSUMED.  The return value is the length of the decoded text.
 123   Below is a template of these functions.  */
 124 #if 0
 125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 126      struct coding_system *coding;
 127      unsigned char *source, *destination;
 128      int src_bytes, dst_bytes;
 129      int *consumed;
 130 {
 131   ...
 132 }
 133 #endif
 134
 135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 136
 137   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 138   internal format (emacs-mule) to CODING.  The resulting text goes to
 139   a place pointed by DESTINATION, the length of which should not
 140   exceed DST_BYTES.  The number of bytes actually processed is
 141   returned as *CONSUMED.  The return value is the length of the encoded text.
 142   Below is a template of these functions.  */
 143 #if 0
 144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148      int *consumed;
 149 {
 150   ...
 151 }
 152 #endif
 153
 154 /*** COMMONLY USED MACROS ***/
 155
 156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 157    THREE_MORE_BYTES safely get one, two, and three bytes from the
 158    source text respectively.  If there are not enough bytes in the
 159    source, they jump to `label_end_of_loop'.  The caller should set
 160    variables `src' and `src_end' to appropriate areas in advance.  */
 161
 162 #define ONE_MORE_BYTE(c1)       \
 163   do {                          \
 164     if (src < src_end)          \
 165       c1 = *src++;              \
 166     else                        \
 167       goto label_end_of_loop;   \
 168   } while (0)
 169
 170 #define TWO_MORE_BYTES(c1, c2)  \
 171   do {                          \
 172     if (src + 1 < src_end)      \
 173       c1 = *src++, c2 = *src++; \
 174     else                        \
 175       goto label_end_of_loop;   \
 176   } while (0)
 177
 178 #define THREE_MORE_BYTES(c1, c2, c3)            \
 179   do {                                          \
 180     if (src + 2 < src_end)                      \
 181       c1 = *src++, c2 = *src++, c3 = *src++;    \
 182     else                                        \
 183       goto label_end_of_loop;                   \
 184   } while (0)
 185
 186 /* The following three macros DECODE_CHARACTER_ASCII,
 187    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 188    the multi-byte form of a character of each class at the place
 189    pointed by `dst'.  The caller should set the variable `dst' to
 190    point to an appropriate area and the variable `coding' to point to
 191    the coding-system of the currently decoding text in advance.  */
 192
 193 /* Decode one ASCII character C.  */
 194
 195 #define DECODE_CHARACTER_ASCII(c)                               \
 196   do {                                                          \
 197     if (COMPOSING_P (coding->composing))                        \
 198       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 199     else                                                        \
 200       *dst++ = (c);                                             \
 201   } while (0)
 202
 203 /* Decode one DIMENSION1 character of which charset is CHARSET and
 204    position-code is C.  */
 205
 206 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 207   do {                                                                  \
 208     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 209     if (COMPOSING_P (coding->composing))                                \
 210       *dst++ = leading_code + 0x20;                                     \
 211     else                                                                \
 212       *dst++ = leading_code;                                            \
 213     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 214       *dst++ = leading_code;                                            \
 215     *dst++ = (c) | 0x80;                                                \
 216   } while (0)
 217
 218 /* Decode one DIMENSION2 character of which charset is CHARSET and
 219    position-codes are C1 and C2.  */
 220
 221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 222   do {                                                  \
 223     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 224     *dst++ = (c2) | 0x80;                               \
 225   } while (0)
 226
 227 \f
 228 /*** 1. Preamble ***/
 229
 230 #include <stdio.h>
 231
 232 #ifdef emacs
 233
 234 #include <config.h>
 235 #include "lisp.h"
 236 #include "buffer.h"
 237 #include "charset.h"
 238 #include "ccl.h"
 239 #include "coding.h"
 240 #include "window.h"
 241
 242 #else  /* not emacs */
 243
 244 #include "mulelib.h"
 245
 246 #endif /* not emacs */
 247
 248 Lisp_Object Qcoding_system, Qeol_type;
 249 Lisp_Object Qbuffer_file_coding_system;
 250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 251
 252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 254 Lisp_Object Qstart_process, Qopen_network_stream;
 255 Lisp_Object Qtarget_idx;
 256
 257 /* Mnemonic character of each format of end-of-line.  */
 258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 259 /* Mnemonic character to indicate format of end-of-line is not yet
 260    decided.  */
 261 int eol_mnemonic_undecided;
 262
 263 #ifdef emacs
 264
 265 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 266
 267 /* Coding-systems are handed between Emacs Lisp programs and C internal
 268    routines by the following three variables.  */
 269 /* Coding-system for reading files and receiving data from process.  */
 270 Lisp_Object Vcoding_system_for_read;
 271 /* Coding-system for writing files and sending data to process.  */
 272 Lisp_Object Vcoding_system_for_write;
 273 /* Coding-system actually used in the latest I/O.  */
 274 Lisp_Object Vlast_coding_system_used;
 275
 276 /* Coding-system of what terminal accept for displaying.  */
 277 struct coding_system terminal_coding;
 278
 279 /* Coding-system of what is sent from terminal keyboard.  */
 280 struct coding_system keyboard_coding;
 281
 282 Lisp_Object Vfile_coding_system_alist;
 283 Lisp_Object Vprocess_coding_system_alist;
 284 Lisp_Object Vnetwork_coding_system_alist;
 285
 286 #endif /* emacs */
 287
 288 Lisp_Object Qcoding_category_index;
 289
 290 /* List of symbols `coding-category-xxx' ordered by priority.  */
 291 Lisp_Object Vcoding_category_list;
 292
 293 /* Table of coding-systems currently assigned to each coding-category.  */
 294 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 295
 296 /* Table of names of symbol for each coding-category.  */
 297 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 298   "coding-category-emacs-mule",
 299   "coding-category-sjis",
 300   "coding-category-iso-7",
 301   "coding-category-iso-8-1",
 302   "coding-category-iso-8-2",
 303   "coding-category-iso-else",
 304   "coding-category-big5",
 305   "coding-category-binary"
 306 };
 307
 308 /* Flag to tell if we look up unification table on character code
 309    conversion.  */
 310 Lisp_Object Venable_character_unification;
 311 /* Standard unification table to look up on reading (decoding).  */
 312 Lisp_Object Vstandard_character_unification_table_for_read;
 313 /* Standard unification table to look up on writing (encoding).  */
 314 Lisp_Object Vstandard_character_unification_table_for_write;
 315
 316 Lisp_Object Qcharacter_unification_table;
 317
 318 /* Alist of charsets vs revision number.  */
 319 Lisp_Object Vcharset_revision_alist;
 320
 321 /* Default coding systems used for process I/O.  */
 322 Lisp_Object Vdefault_process_coding_system;
 323
 324 \f
 325 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 326
 327 /* Emacs' internal format for encoding multiple character sets is a
 328    kind of multi-byte encoding, i.e. encoding a character by a sequence
 329    of one-byte codes of variable length.  ASCII characters and control
 330    characters (e.g. `tab', `newline') are represented by one-byte as
 331    is.  It takes the range 0x00 through 0x7F.  The other characters
 332    are represented by a sequence of `base leading-code', optional
 333    `extended leading-code', and one or two `position-code's.  Length
 334    of the sequence is decided by the base leading-code.  Leading-code
 335    takes the range 0x80 through 0x9F, whereas extended leading-code
 336    and position-code take the range 0xA0 through 0xFF.  See the
 337    document of `charset.h' for more detail about leading-code and
 338    position-code.
 339
 340    There's one exception in this rule.  Special leading-code
 341    `leading-code-composition' denotes that the following several
 342    characters should be composed into one character.  Leading-codes of
 343    components (except for ASCII) are added 0x20.  An ASCII character
 344    component is represented by a 2-byte sequence of `0xA0' and
 345    `ASCII-code + 0x80'.  See also the document in `charset.h' for the
 346    detail of composite character.  Hence, we can summarize the code
 347    range as follows:
 348
 349    --- CODE RANGE of Emacs' internal format ---
 350    (character set)      (range)
 351    ASCII                0x00 .. 0x7F
 352    ELSE (1st byte)      0x80 .. 0x9F
 353         (rest bytes)    0xA0 .. 0xFF
 354    ---------------------------------------------
 355
 356   */
 357
 /* Code class of each byte value, indexed by the byte itself;
    detect_coding_emacs_mule switches on it.  */
 358 enum emacs_code_class_type emacs_code_class[256];
 359
 360 /* Go to the next statement only if *SRC is accessible and the code is
 361    greater than 0xA0.  */
 362 #define CHECK_CODE_RANGE_A0_FF  \
 363   do {                          \
 364     if (src >= src_end)         \
 365       goto label_end_of_switch; \
 366     else if (*src++ < 0xA0)     \
 367       return 0;                 \
 368   } while (0)
 369
 370 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 371    Check if a text is encoded in Emacs' internal format.  If it is,
 372    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 373
 374 int
 375 detect_coding_emacs_mule (src, src_end)
 376      unsigned char *src, *src_end;
 377 {
 378   unsigned char c;
 379   int composing = 0;
 380
 381   while (src < src_end)
 382     {
 383       c = *src++;
 384
 385       if (composing)
 386         {
 387           if (c < 0xA0)
 388             composing = 0;
 389           else
 390             c -= 0x20;
 391         }
 392
 393       switch (emacs_code_class[c])
 394         {
 395         case EMACS_ascii_code:
 396         case EMACS_linefeed_code:
 397           break;
 398
 399         case EMACS_control_code:
 400           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 401             return 0;
 402           break;
 403
 404         case EMACS_invalid_code:
 405           return 0;
 406
 407         case EMACS_leading_code_composition: /* c == 0x80 */
 408           if (composing)
 409             CHECK_CODE_RANGE_A0_FF;
 410           else
 411             composing = 1;
 412           break;
 413
 414         case EMACS_leading_code_4:
 415           CHECK_CODE_RANGE_A0_FF;
 416           /* fall down to check it two more times ...  */
 417
 418         case EMACS_leading_code_3:
 419           CHECK_CODE_RANGE_A0_FF;
 420           /* fall down to check it one more time ...  */
 421
 422         case EMACS_leading_code_2:
 423           CHECK_CODE_RANGE_A0_FF;
 424           break;
 425
 426         default:
 427         label_end_of_switch:
 428           break;
 429         }
 430     }
 431   return CODING_CATEGORY_MASK_EMACS_MULE;
 432 }
 433
 434 \f
 435 /*** 3. ISO2022 handlers ***/
 436
 437 /* The following note describes the coding system ISO2022 briefly.
 438    Since the intention of this note is to help understanding of the
 439    programs in this file, some parts are NOT ACCURATE or OVERLY
 440    SIMPLIFIED.  For the thorough understanding, please refer to the
 441    original document of ISO2022.
 442
 443    ISO2022 provides many mechanisms to encode several character sets
 444    in 7-bit and 8-bit environments.  If one chooses the 7-bit environment,
 445    all text is encoded by codes of less than 128.  This may make the
 446    encoded text a little bit longer, but the text gets more stability
 447    to pass through several gateways (some of them split MSB off).
 448
 449    There are two kinds of character set: control character set and
 450    graphic character set.  The former contains control characters such
 451    as `newline' and `escape' to provide control functions (control
 452    functions are provided also by escape sequence).  The latter
 453    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 454    two control character sets and many graphic character sets.
 455
 456    Graphic character sets are classified into one of the following
 457    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 458    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 459    bytes (DIMENSION) and the number of characters in one dimension
 460    (CHARS) of the set.  In addition, each character set is assigned an
 461    identification tag (called "final character" and denoted as <F>
 462    here after) which is unique in each class.  <F> of each character
 463    set is decided by ECMA(*) when it is registered in ISO.  Code range
 464    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 465
 466    Note (*): ECMA = European Computer Manufacturers Association
 467
 468    Here are examples of graphic character set [NAME(<F>)]:
 469         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 470         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 471         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 472         o DIMENSION2_CHARS96 -- none for the moment
 473
 474    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 475         C0 [0x00..0x1F] -- control character plane 0
 476         GL [0x20..0x7F] -- graphic character plane 0
 477         C1 [0x80..0x9F] -- control character plane 1
 478         GR [0xA0..0xFF] -- graphic character plane 1
 479
 480    A control character set is directly designated and invoked to C0 or
 481    C1 by an escape sequence.  The most common case is that ISO646's
 482    control character set is designated/invoked to C0 and ISO6429's
 483    control character set is designated/invoked to C1, and usually
 484    these designations/invocations are omitted in a coded text.  With
 485    7-bit environment, only C0 can be used, and a control character for
 486    C1 is encoded by an appropriate escape sequence to fit in the
 487    environment.  Every control character for C1 has a corresponding
 488    escape sequence defined.
 489
 490    A graphic character set is at first designated to one of four
 491    graphic registers (G0 through G3), then these graphic registers are
 492    invoked to GL or GR.  These designations and invocations can be
 493    done independently.  The most common case is that G0 is invoked to
 494    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 495    these invocations and designations are omitted in a coded text.
 496    With 7-bit environment, only GL can be used.
 497
 498    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 499    and 0x7F of GL area work as control characters SPACE and DEL
 500    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 501
 502    There are two ways of invocation: locking-shift and single-shift.
 503    With locking-shift, the invocation lasts until the next different
 504    invocation, whereas with single-shift, the invocation works only
 505    for the following character and doesn't affect locking-shift.
 506    Invocations are done by the following control characters or escape
 507    sequences.
 508
 509    ----------------------------------------------------------------------
 510    function             control char    escape sequence description
 511    ----------------------------------------------------------------------
 512    SI  (shift-in)               0x0F    none            invoke G0 to GL
 513    SO  (shift-out)              0x0E    none            invoke G1 to GL
 514    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 515    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 516    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 517    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 518    ----------------------------------------------------------------------
 519    The first four are for locking-shift.  Control characters for these
 520    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 521
 522    Designations are done by the following escape sequences.
 523    ----------------------------------------------------------------------
 524    escape sequence      description
 525    ----------------------------------------------------------------------
 526    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 527    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 528    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 529    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 530    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 531    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 532    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 533    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 534    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 535    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 536    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 537    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 538    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 539    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 540    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 541    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 542    ----------------------------------------------------------------------
 543
 544    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 545    of dimension 1, chars 94, and final character <F>, and etc.
 546
 547    Note (*): Although these designations are not allowed in ISO2022,
 548    Emacs accepts them on decoding, and produces them on encoding
 549    CHARS96 character set in a coding system which is characterized as
 550    7-bit environment, non-locking-shift, and non-single-shift.
 551
 552    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 553    '(' can be omitted.  We call this as "short-form" here after.
 554
 555    Now you may notice that there are a lot of ways for encoding the
 556    same multilingual text in ISO2022.  Actually, there exist many
 557    coding systems such as Compound Text (used in X's inter client
 558    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 559    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 560    localized platforms), and all of these are variants of ISO2022.
 561
 562    In addition to the above, Emacs handles two more kinds of escape
 563    sequences: ISO6429's direction specification and Emacs' private
 564    sequence for specifying character composition.
 565
 566    ISO6429's direction specification takes the following format:
 567         o CSI ']'      -- end of the current direction
 568         o CSI '0' ']'  -- end of the current direction
 569         o CSI '1' ']'  -- start of left-to-right text
 570         o CSI '2' ']'  -- start of right-to-left text
 571    The control character CSI (0x9B: control sequence introducer) is
 572    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 573
 574    Character composition specification takes the following format:
 575         o ESC '0' -- start character composition
 576         o ESC '1' -- end character composition
 577    Since these are not standard escape sequences of any ISO, the use
 578    of them for these meaning is restricted to Emacs only.  */
 579
 /* Code class of each byte value, indexed by the byte itself;
    decode_coding_iso2022 switches on it.  */
 580 enum iso_code_class_type iso_code_class[256];
 581
 582 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 583    Check if a text is encoded in ISO2022.  If it is, returns an
 584    integer in which appropriate flag bits any of:
 585         CODING_CATEGORY_MASK_ISO_7
 586         CODING_CATEGORY_MASK_ISO_8_1
 587         CODING_CATEGORY_MASK_ISO_8_2
 588         CODING_CATEGORY_MASK_ISO_ELSE
 589    are set.  If a code which should never appear in ISO2022 is found,
 590    returns 0.  */
 591
 592 int
 593 detect_coding_iso2022 (src, src_end)
 594      unsigned char *src, *src_end;
 595 {
 596   int mask = (CODING_CATEGORY_MASK_ISO_7
 597               | CODING_CATEGORY_MASK_ISO_8_1
 598               | CODING_CATEGORY_MASK_ISO_8_2
 599               | CODING_CATEGORY_MASK_ISO_ELSE);
 600   int g1 = 0;                   /* 1 iff designating to G1.  */
 601   int c, i;
 602
 603   while (src < src_end)
 604     {
 605       c = *src++;
 606       switch (c)
 607         {
 608         case ISO_CODE_ESC:
 609           if (src >= src_end)
 610             break;
 611           c = *src++;
 612           if (src < src_end
 613               && ((c >= '(' && c <= '/')
 614                   || c == '$' && ((*src >= '(' && *src <= '/')
 615                                   || (*src >= '@' && *src <= 'B'))))
 616             {
 617               /* Valid designation sequence.  */
 618               if (c == ')' || (c == '$' && *src == ')'))
 619                 {
 620                   g1 = 1;
 621                   mask &= ~CODING_CATEGORY_MASK_ISO_7;
 622                 }
 623               src++;
 624               break;
 625             }
 626           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 627             return CODING_CATEGORY_MASK_ISO_ELSE;
 628           break;
 629
 630         case ISO_CODE_SO:
 631           if (g1)
 632             return CODING_CATEGORY_MASK_ISO_ELSE;
 633           break;
 634
 635         case ISO_CODE_CSI:
 636         case ISO_CODE_SS2:
 637         case ISO_CODE_SS3:
 638           mask &= ~CODING_CATEGORY_MASK_ISO_7;
 639           break;
 640
 641         default:
 642           if (c < 0x80)
 643             break;
 644           else if (c < 0xA0)
 645             return 0;
 646           else
 647             {
 648               int count = 1;
 649
 650               mask &= ~CODING_CATEGORY_MASK_ISO_7;
 651               while (src < src_end && *src >= 0xA0)
 652                 count++, src++;
 653               if (count & 1 && src < src_end)
 654                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 655             }
 656           break;
 657         }
 658     }
 659
 660   return mask;
 661 }
 662
 663 /* Decode a character of which charset is CHARSET and the 1st position
 664    code is C1, and store its multi-byte form at `dst'.  If dimension of
 665    CHARSET is 2, the 2nd position code is fetched from SRC and set to C2
 666    (control jumps to `label_end_of_loop' if SRC is exhausted).  If
 667    CHARSET is negative, it means that we are decoding ill formed text,
        and what we can do is just to store C1 as is.

        When COMPOSING_HEAD_P holds for `coding->composing',
        LEADING_CODE_COMPOSITION (followed by 0xFF if
        COMPOSING_WITH_RULE_P holds) is stored first and
        `coding->composing' is increased by 2.  If `unification_table' is
        non-nil and unify_char returns a non-negative value, SPLIT_CHAR is
        applied to that value with `charset_alt', C1, and C2 as its other
        arguments -- presumably replacing the charset and position codes
        that get stored; confirm against SPLIT_CHAR.  At the end, if
        COMPOSING_WITH_RULE_P holds, `coding->composing' is set to
        COMPOSING_WITH_RULE_RULE.

        The caller must have `coding', `src', `src_end', `dst', `c2',
        `unification_table', and the label `label_end_of_loop' in scope.

        NOTE(review): C2 is passed to unify_char even when CHARSET is of
        dimension 1, in which case this macro has not set it -- confirm
        that unify_char ignores it then.  */
 668
 669 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 670   do {                                                                  \
 671     int c_alt, charset_alt = (charset);                                 \
 672     if (COMPOSING_HEAD_P (coding->composing))                           \
 673       {                                                                 \
 674         *dst++ = LEADING_CODE_COMPOSITION;                              \
 675         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 676           /* To tell composition rules are embedded.  */                \
 677           *dst++ = 0xFF;                                                \
 678         coding->composing += 2;                                         \
 679       }                                                                 \
 680     if ((charset) >= 0)                                                 \
 681       {                                                                 \
 682         if (CHARSET_DIMENSION (charset) == 2)                           \
 683           ONE_MORE_BYTE (c2);     /* May jump to label_end_of_loop.  */ \
 684         if (!NILP (unification_table)                                   \
 685             && ((c_alt = unify_char (unification_table,                 \
 686                                      -1, (charset), c1, c2)) >= 0))     \
 687           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 688       }                                                                 \
 689     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 690       DECODE_CHARACTER_ASCII (c1);                                      \
 691     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 692       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 693     else                                                                \
 694       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 695     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 696       /* To tell a composition rule follows.  */                        \
 697       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 698   } while (0)
 699
 700 /* Set designation state into CODING.  */
 701 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 702   do {                                                                  \
 703     int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);     \
 704     if (charset >= 0)                                                   \
 705       {                                                                 \
 706         if (coding->direction == 1                                      \
 707             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 708           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 709         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 710       }                                                                 \
 711   } while (0)
 712
 713 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 714
 715 int
 716 decode_coding_iso2022 (coding, source, destination,
 717                        src_bytes, dst_bytes, consumed)
 718      struct coding_system *coding;
 719      unsigned char *source, *destination;
 720      int src_bytes, dst_bytes;
 721      int *consumed;
 722 {
 723   unsigned char *src = source;
 724   unsigned char *src_end = source + src_bytes;
 725   unsigned char *dst = destination;
 726   unsigned char *dst_end = destination + dst_bytes;
 727   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 728      from DST_END to assure that overflow checking is necessary only
 729      at the head of loop.  */
 730   unsigned char *adjusted_dst_end = dst_end - 6;
 731   int charset;
 732   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 733   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 734   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 735   Lisp_Object unification_table = coding->character_unification_table;
 736
 737   if (!NILP (Venable_character_unification) && NILP (unification_table))
 738     unification_table = Vstandard_character_unification_table_for_read;
 739
 740   while (src < src_end && dst < adjusted_dst_end)
 741     {
 742       /* SRC_BASE remembers the start position in source in each loop.
 743          The loop will be exited when there's not enough source text
 744          to analyze long escape sequence or 2-byte code (within macros
 745          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 746          to SRC_BASE before exiting.  */
 747       unsigned char *src_base = src;
 748       int c1 = *src++, c2;
 749
 750       switch (iso_code_class [c1])
 751         {
 752         case ISO_0x20_or_0x7F:
 753           if (!coding->composing
 754               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 755             {
 756               /* This is SPACE or DEL.  */
 757               *dst++ = c1;
 758               break;
 759             }
 760           /* This is a graphic character, we fall down ...  */
 761
 762         case ISO_graphic_plane_0:
 763           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 764             {
 765               /* This is a composition rule.  */
 766               *dst++ = c1 | 0x80;
 767               coding->composing = COMPOSING_WITH_RULE_TAIL;
 768             }
 769           else
 770             DECODE_ISO_CHARACTER (charset0, c1);
 771           break;
 772
 773         case ISO_0xA0_or_0xFF:
 774           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 775             {
 776               /* Invalid code.  */
 777               *dst++ = c1;
 778               break;
 779             }
 780           /* This is a graphic character, we fall down ... */
 781
 782         case ISO_graphic_plane_1:
 783           DECODE_ISO_CHARACTER (charset1, c1);
 784           break;
 785
 786         case ISO_control_code:
 787           /* All ISO2022 control characters in this class have the
 788              same representation in Emacs internal format.  */
 789           *dst++ = c1;
 790           break;
 791
 792         case ISO_carriage_return:
 793           if (coding->eol_type == CODING_EOL_CR)
 794             {
 795               *dst++ = '\n';
 796             }
 797           else if (coding->eol_type == CODING_EOL_CRLF)
 798             {
 799               ONE_MORE_BYTE (c1);
 800               if (c1 == ISO_CODE_LF)
 801                 *dst++ = '\n';
 802               else
 803                 {
 804                   src--;
 805                   *dst++ = c1;
 806                 }
 807             }
 808           else
 809             {
 810               *dst++ = c1;
 811             }
 812           break;
 813
 814         case ISO_shift_out:
 815           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 816             goto label_invalid_escape_sequence;
 817           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 818           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 819           break;
 820
 821         case ISO_shift_in:
 822           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 823           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 824           break;
 825
 826         case ISO_single_shift_2_7:
 827         case ISO_single_shift_2:
 828           /* SS2 is handled as an escape sequence of ESC 'N' */
 829           c1 = 'N';
 830           goto label_escape_sequence;
 831
 832         case ISO_single_shift_3:
 833           /* SS2 is handled as an escape sequence of ESC 'O' */
 834           c1 = 'O';
 835           goto label_escape_sequence;
 836
 837         case ISO_control_sequence_introducer:
 838           /* CSI is handled as an escape sequence of ESC '[' ...  */
 839           c1 = '[';
 840           goto label_escape_sequence;
 841
 842         case ISO_escape:
 843           ONE_MORE_BYTE (c1);
 844         label_escape_sequence:
 845           /* Escape sequences handled by Emacs are invocation,
 846              designation, direction specification, and character
 847              composition specification.  */
 848           switch (c1)
 849             {
 850             case '&':           /* revision of following character set */
 851               ONE_MORE_BYTE (c1);
 852               if (!(c1 >= '@' && c1 <= '~'))
 853                 goto label_invalid_escape_sequence;
 854               ONE_MORE_BYTE (c1);
 855               if (c1 != ISO_CODE_ESC)
 856                 goto label_invalid_escape_sequence;
 857               ONE_MORE_BYTE (c1);
 858               goto label_escape_sequence;
 859
 860             case '$':           /* designation of 2-byte character set */
 861               ONE_MORE_BYTE (c1);
 862               if (c1 >= '@' && c1 <= 'B')
 863                 {       /* designation of JISX0208.1978, GB2312.1980,
 864                                    or JISX0208.1980 */
 865                   DECODE_DESIGNATION (0, 2, 94, c1);
 866                 }
 867               else if (c1 >= 0x28 && c1 <= 0x2B)
 868                 {       /* designation of DIMENSION2_CHARS94 character set */
 869                   ONE_MORE_BYTE (c2);
 870                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 871                 }
 872               else if (c1 >= 0x2C && c1 <= 0x2F)
 873                 {       /* designation of DIMENSION2_CHARS96 character set */
 874                   ONE_MORE_BYTE (c2);
 875                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 876                 }
 877               else
 878                 goto label_invalid_escape_sequence;
 879               break;
 880
 881             case 'n':           /* invocation of locking-shift-2 */
 882               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 883                 goto label_invalid_escape_sequence;
 884               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 885               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 886               break;
 887
 888             case 'o':           /* invocation of locking-shift-3 */
 889               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 890                 goto label_invalid_escape_sequence;
 891               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 892               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 893               break;
 894
 895             case 'N':           /* invocation of single-shift-2 */
 896               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 897                 goto label_invalid_escape_sequence;
 898               ONE_MORE_BYTE (c1);
 899               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 900               DECODE_ISO_CHARACTER (charset, c1);
 901               break;
 902
 903             case 'O':           /* invocation of single-shift-3 */
 904               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 905                 goto label_invalid_escape_sequence;
 906               ONE_MORE_BYTE (c1);
 907               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 908               DECODE_ISO_CHARACTER (charset, c1);
 909               break;
 910
 911             case '0':           /* start composing without embeded rules */
 912               coding->composing = COMPOSING_NO_RULE_HEAD;
 913               break;
 914
 915             case '1':           /* end composing */
 916               coding->composing = COMPOSING_NO;
 917               break;
 918
 919             case '2':           /* start composing with embeded rules */
 920               coding->composing = COMPOSING_WITH_RULE_HEAD;
 921               break;
 922
 923             case '[':           /* specification of direction */
 924               /* For the moment, nested direction is not supported.
 925                  So, the value of `coding->direction' is 0 or 1: 0
 926                  means left-to-right, 1 means right-to-left.  */
 927               ONE_MORE_BYTE (c1);
 928               switch (c1)
 929                 {
 930                 case ']':       /* end of the current direction */
 931                   coding->direction = 0;
 932
 933                 case '0':       /* end of the current direction */
 934                 case '1':       /* start of left-to-right direction */
 935                   ONE_MORE_BYTE (c1);
 936                   if (c1 == ']')
 937                     coding->direction = 0;
 938                   else
 939                     goto label_invalid_escape_sequence;
 940                   break;
 941
 942                 case '2':       /* start of right-to-left direction */
 943                   ONE_MORE_BYTE (c1);
 944                   if (c1 == ']')
 945                     coding->direction= 1;
 946                   else
 947                     goto label_invalid_escape_sequence;
 948                   break;
 949
 950                 default:
 951                   goto label_invalid_escape_sequence;
 952                 }
 953               break;
 954
 955             default:
 956               if (c1 >= 0x28 && c1 <= 0x2B)
 957                 {       /* designation of DIMENSION1_CHARS94 character set */
 958                   ONE_MORE_BYTE (c2);
 959                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 960                 }
 961               else if (c1 >= 0x2C && c1 <= 0x2F)
 962                 {       /* designation of DIMENSION1_CHARS96 character set */
 963                   ONE_MORE_BYTE (c2);
 964                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 965                 }
 966               else
 967                 {
 968                   goto label_invalid_escape_sequence;
 969                 }
 970             }
 971           /* We must update these variables now.  */
 972           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 973           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 974           break;
 975
 976         label_invalid_escape_sequence:
 977           {
 978             int length = src - src_base;
 979
 980             bcopy (src_base, dst, length);
 981             dst += length;
 982           }
 983         }
 984       continue;
 985
 986     label_end_of_loop:
 987       coding->carryover_size = src - src_base;
 988       bcopy (src_base, coding->carryover, coding->carryover_size);
 989       src = src_base;
 990       break;
 991     }
 992
 993   /* If this is the last block of the text to be decoded, we had
 994      better just flush out all remaining codes in the text although
 995      they are not valid characters.  */
 996   if (coding->last_block)
 997     {
 998       bcopy (src, dst, src_end - src);
 999       dst += (src_end - src);
1000       src = src_end;
1001     }
1002   *consumed = src - source;
1003   return dst - destination;
1004 }
1005
1006 /* ISO2022 encoding stuff.  */
1007
1008 /*
1009    It is not enough to say just "ISO2022" on encoding, but we have to
1010    specify more details.  In Emacs, each coding-system of ISO2022
1011    variant has the following specifications:
1012         1. Initial designation to G0 thru G3.
1013         2. Allows short-form designation?
1014         3. ASCII should be designated to G0 before control characters?
1015         4. ASCII should be designated to G0 at end of line?
1016         5. 7-bit environment or 8-bit environment?
1017         6. Use locking-shift?
1018         7. Use Single-shift?
1019    And the following two are only for Japanese:
1020         8. Use ASCII in place of JIS0201-1976-Roman?
1021         9. Use JISX0208-1983 in place of JISX0208-1978?
1022    These specifications are encoded in `coding->flags' as flag bits
1023    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1024    detail.
1025 */
1026
/* Store at DST the escape sequence designating CHARSET to graphic
   register REG, and record that designation in CODING.

   If CHARSET has an entry in Vcharset_revision_alist, the sequence is
   preceded by ESC '&' and a revision byte.  A 2-byte charset gets '$'
   after ESC.  The intermediate character selecting REG is taken from
   "()*+" for a 94-charset and from ",-./" for any other; it is
   omitted (short form) only for a 2-byte 94-charset designated to
   register 0 whose <final-char> is '@', 'A', or 'B', and only when
   CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char fbyte_ = CHARSET_ISO_FINAL_CHAR (charset);            \
    Lisp_Object revision_                                               \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int two_byte_ = CHARSET_DIMENSION (charset) != 1;                   \
    int chars94_ = CHARSET_CHARS (charset) == 94;                       \
    if (! NILP (revision_))                                             \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision_)->cdr) + '@';                   \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (two_byte_)                                                      \
      *dst++ = '$';                                                     \
    if (! chars94_)                                                     \
      *dst++ = (unsigned char) (",-./"[reg]);                           \
    else if (! two_byte_                                                \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || fbyte_ < '@' || fbyte_ > 'B')                           \
      *dst++ = (unsigned char) ("()*+"[reg]);                           \
    *dst++ = fbyte_;                                                    \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1069
/* Producers for the ISO2022 single-shift functions.  Each of the two
   macros below stores at DST the code for its function (single-shift-2
   or single-shift-3): ESC followed by a letter when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the one-byte control code otherwise.
   Both then put CODING into the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1091
/* Producers for the ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each of the four
   macros below stores its control character or escape sequence at DST
   and records in CODING which graphic register (0 thru 3) is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1119
/* Store at DST the code of a DIMENSION1 character whose character set
   is CHARSET and whose position-code is C1.

   While CODING is in the single-shifting state, the byte is written
   at once (8th bit cleared under CODING_FLAG_ISO_SEVEN_BITS, set
   otherwise) and that state is left.  Otherwise the byte is written
   with the 8th bit cleared if CHARSET is invoked to graphic plane 0,
   or set if it is invoked to graphic plane 1.  If CHARSET is invoked
   to neither plane, encode_invocation_designation is called first to
   produce the necessary designation/invocation codes and the checks
   are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
      {                                                                 \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            break;                                                      \
          }                                                             \
        dst = encode_invocation_designation (charset, coding, dst);     \
        continue;                                                       \
      }                                                                 \
    *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)              \
              ? (c1 & 0x7F) : (c1 | 0x80));                             \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                       \
    break;                                                              \
  } while (1)
1153
/* Store at DST the two-byte code of a DIMENSION2 character whose
   character set is CHARSET and whose position-codes are C1 and C2.

   While CODING is in the single-shifting state, both bytes are
   written at once (8th bits cleared under CODING_FLAG_ISO_SEVEN_BITS,
   set otherwise) and that state is left.  Otherwise the bytes are
   written with the 8th bits cleared if CHARSET is invoked to graphic
   plane 0, or set if it is invoked to graphic plane 1.  If CHARSET is
   invoked to neither plane, encode_invocation_designation is called
   first to produce the necessary designation/invocation codes and
   the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
      {                                                                 \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
            break;                                                      \
          }                                                             \
        dst = encode_invocation_designation (charset, coding, dst);     \
        continue;                                                       \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                       \
    break;                                                              \
  } while (1)
1186
/* Produce codes for a character of character set CHARSET whose
   position-codes are C1 and C2 (C2 is ignored for a DIMENSION1
   charset).

   When a unification table is in effect (the variable
   `unification_table' is non-nil) and unify_char finds a unified
   character (a non-negative result), that character is encoded
   instead: SPLIT_CHAR stores its charset and position-codes into a
   local charset variable and into C1 and C2, so C1 and C2 must be
   assignable.  Otherwise CHARSET itself is used.  The actual codes
   are produced by ENCODE_ISO_CHARACTER_DIMENSION1 or
   ENCODE_ISO_CHARACTER_DIMENSION2 according to the dimension of the
   charset finally chosen.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (CHARSET_DIMENSION (charset_alt) == 1)                             \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
  } while (0)
1201
1202 /* Produce designation and invocation codes at a place pointed by DST
1203    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1204    Return new DST.  */
1205
1206 unsigned char *
1207 encode_invocation_designation (charset, coding, dst)
1208      int charset;
1209      struct coding_system *coding;
1210      unsigned char *dst;
1211 {
1212   int reg;                      /* graphic register number */
1213
1214   /* At first, check designations.  */
1215   for (reg = 0; reg < 4; reg++)
1216     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1217       break;
1218
1219   if (reg >= 4)
1220     {
1221       /* CHARSET is not yet designated to any graphic registers.  */
1222       /* At first check the requested designation.  */
1223       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1224       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1225         /* Since CHARSET requests no special designation, designate it
1226            to graphic register 0.  */
1227         reg = 0;
1228
1229       ENCODE_DESIGNATION (charset, reg, coding);
1230     }
1231
1232   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1233       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1234     {
1235       /* Since the graphic register REG is not invoked to any graphic
1236          planes, invoke it to graphic plane 0.  */
1237       switch (reg)
1238         {
1239         case 0:                 /* graphic register 0 */
1240           ENCODE_SHIFT_IN;
1241           break;
1242
1243         case 1:                 /* graphic register 1 */
1244           ENCODE_SHIFT_OUT;
1245           break;
1246
1247         case 2:                 /* graphic register 2 */
1248           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1249             ENCODE_SINGLE_SHIFT_2;
1250           else
1251             ENCODE_LOCKING_SHIFT_2;
1252           break;
1253
1254         case 3:                 /* graphic register 3 */
1255           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1256             ENCODE_SINGLE_SHIFT_3;
1257           else
1258             ENCODE_LOCKING_SHIFT_3;
1259           break;
1260         }
1261     }
1262   return dst;
1263 }
1264
/* The following three macros produce the escape sequences that mark
   composition: start of composition without rules (ESC '0'), start of
   composition with rules (ESC '2'), and end of composition (ESC '1').
   Each stores two bytes at DST.  */
#define ENCODE_COMPOSITION_NO_RULE_START        \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START      \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END                  \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1269
/* The following three macros produce codes for indicating direction
   of text.  All of them are statements that store their codes at DST.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC '[' when CODING has
   the flag bit CODING_FLAG_ISO_SEVEN_BITS set, and the one-byte CSI
   otherwise.  The flag is tested as a bit, so other bits set in
   `coding->flags' do not affect the choice.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R produce the control
   sequence introducer followed by "2]" and "0]" respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1285
/* Bring the graphic planes and registers of CODING back to their
   initial state, storing the necessary codes at DST: shift-in is
   produced if graphic plane 0 does not currently invoke register 0,
   and every register that has an initial designation (a non-negative
   value) differing from its current designation is designated
   anew.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int gr_;                                                                \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (gr_ = 0; gr_ < 4; gr_++)                                           \
      {                                                                     \
        int initial_ = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, gr_);   \
        if (initial_ < 0                                                    \
            || CODING_SPEC_ISO_DESIGNATION (coding, gr_) == initial_)       \
          continue;                                                         \
        ENCODE_DESIGNATION (initial_, gr_, coding);                         \
      }                                                                     \
  } while (0)
1300
1301 /* Produce designation sequences of charsets in the line started from
1302    *SRC to a place pointed by DSTP.
1303
1304    If the current block ends before any end-of-line, we may fail to
1305    find all the necessary *designations.  */
1306 encode_designation_at_bol (coding, table, src, src_end, dstp)
1307      struct coding_system *coding;
1308      Lisp_Object table;
1309      unsigned char *src, *src_end, **dstp;
1310 {
1311   int charset, c, found = 0, reg;
1312   /* Table of charsets to be designated to each graphic register.  */
1313   int r[4];
1314   unsigned char *dst = *dstp;
1315
1316   for (reg = 0; reg < 4; reg++)
1317     r[reg] = -1;
1318
1319   while (src < src_end && *src != '\n' && found < 4)
1320     {
1321       int bytes = BYTES_BY_CHAR_HEAD (*src);
1322
1323       if (NILP (table))
1324         charset = CHARSET_AT (src);
1325       else
1326         {
1327           int c_alt, c1, c2;
1328
1329           SPLIT_STRING(src, bytes, charset, c1, c2);
1330           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1331             charset = CHAR_CHARSET (c_alt);
1332         }
1333
1334       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1335       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1336         {
1337           found++;
1338           r[reg] = charset;
1339         }
1340
1341       src += bytes;
1342     }
1343
1344   if (found)
1345     {
1346       for (reg = 0; reg < 4; reg++)
1347         if (r[reg] >= 0
1348             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1349           ENCODE_DESIGNATION (r[reg], reg, coding);
1350       *dstp = dst;
1351     }
1352 }
1353
1354 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1355
1356 int
1357 encode_coding_iso2022 (coding, source, destination,
1358                        src_bytes, dst_bytes, consumed)
1359      struct coding_system *coding;
1360      unsigned char *source, *destination;
1361      int src_bytes, dst_bytes;
1362      int *consumed;
1363 {
1364   unsigned char *src = source;
1365   unsigned char *src_end = source + src_bytes;
1366   unsigned char *dst = destination;
1367   unsigned char *dst_end = destination + dst_bytes;
1368   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1369      from DST_END to assure overflow checking is necessary only at the
1370      head of loop.  */
1371   unsigned char *adjusted_dst_end = dst_end - 19;
1372   Lisp_Object unification_table = coding->character_unification_table;
1373
1374   if (!NILP (Venable_character_unification) && NILP (unification_table))
1375     unification_table = Vstandard_character_unification_table_for_write;
1376
1377   while (src < src_end && dst < adjusted_dst_end)
1378     {
1379       /* SRC_BASE remembers the start position in source in each loop.
1380          The loop will be exited when there's not enough source text
1381          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1382          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1383          reset to SRC_BASE before exiting.  */
1384       unsigned char *src_base = src;
1385       int charset, c1, c2, c3, c4;
1386
1387       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1388           && CODING_SPEC_ISO_BOL (coding))
1389         {
1390           /* We have to produce designation sequences if any now.  */
1391           encode_designation_at_bol (coding, unification_table,
1392                                      src, src_end, &dst);
1393           CODING_SPEC_ISO_BOL (coding) = 0;
1394         }
1395
1396       c1 = *src++;
1397       /* If we are seeing a component of a composite character, we are
1398          seeing a leading-code specially encoded for composition, or a
1399          composition rule if composing with rule.  We must set C1
1400          to a normal leading-code or an ASCII code.  If we are not at
1401          a composed character, we must reset the composition state.  */
1402       if (COMPOSING_P (coding->composing))
1403         {
1404           if (c1 < 0xA0)
1405             {
1406               /* We are not in a composite character any longer.  */
1407               coding->composing = COMPOSING_NO;
1408               ENCODE_COMPOSITION_END;
1409             }
1410           else
1411             {
1412               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1413                 {
1414                   *dst++ = c1 & 0x7F;
1415                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1416                   continue;
1417                 }
1418               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1419                 coding->composing = COMPOSING_WITH_RULE_RULE;
1420               if (c1 == 0xA0)
1421                 {
1422                   /* This is an ASCII component.  */
1423                   ONE_MORE_BYTE (c1);
1424                   c1 &= 0x7F;
1425                 }
1426               else
1427                 /* This is a leading-code of non ASCII component.  */
1428                 c1 -= 0x20;
1429             }
1430         }
1431
1432       /* Now encode one character.  C1 is a control character, an
1433          ASCII character, or a leading-code of multi-byte character.  */
1434       switch (emacs_code_class[c1])
1435         {
1436         case EMACS_ascii_code:
1437           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1438           break;
1439
1440         case EMACS_control_code:
1441           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1442             ENCODE_RESET_PLANE_AND_REGISTER;
1443           *dst++ = c1;
1444           break;
1445
1446         case EMACS_carriage_return_code:
1447           if (!coding->selective)
1448             {
1449               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1450                 ENCODE_RESET_PLANE_AND_REGISTER;
1451               *dst++ = c1;
1452               break;
1453             }
1454           /* fall down to treat '\r' as '\n' ...  */
1455
1456         case EMACS_linefeed_code:
1457           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1458             ENCODE_RESET_PLANE_AND_REGISTER;
1459           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1460             bcopy (coding->spec.iso2022.initial_designation,
1461                    coding->spec.iso2022.current_designation,
1462                    sizeof coding->spec.iso2022.initial_designation);
1463           if (coding->eol_type == CODING_EOL_LF
1464               || coding->eol_type == CODING_EOL_UNDECIDED)
1465             *dst++ = ISO_CODE_LF;
1466           else if (coding->eol_type == CODING_EOL_CRLF)
1467             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1468           else
1469             *dst++ = ISO_CODE_CR;
1470           CODING_SPEC_ISO_BOL (coding) = 1;
1471           break;
1472
1473         case EMACS_leading_code_2:
1474           ONE_MORE_BYTE (c2);
1475           ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1476           break;
1477
1478         case EMACS_leading_code_3:
1479           TWO_MORE_BYTES (c2, c3);
1480           if (c1 < LEADING_CODE_PRIVATE_11)
1481             ENCODE_ISO_CHARACTER (c1, c2, c3);
1482           else
1483             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1484           break;
1485
1486         case EMACS_leading_code_4:
1487           THREE_MORE_BYTES (c2, c3, c4);
1488           ENCODE_ISO_CHARACTER (c2, c3, c4);
1489           break;
1490
1491         case EMACS_leading_code_composition:
1492           ONE_MORE_BYTE (c1);
1493           if (c1 == 0xFF)
1494             {
1495               coding->composing = COMPOSING_WITH_RULE_HEAD;
1496               ENCODE_COMPOSITION_WITH_RULE_START;
1497             }
1498           else
1499             {
1500               /* Rewind one byte because it is a character code of
1501                  composition elements.  */
1502               src--;
1503               coding->composing = COMPOSING_NO_RULE_HEAD;
1504               ENCODE_COMPOSITION_NO_RULE_START;
1505             }
1506           break;
1507
1508         case EMACS_invalid_code:
1509           *dst++ = c1;
1510           break;
1511         }
1512       continue;
1513     label_end_of_loop:
1514       coding->carryover_size = src - src_base;
1515       bcopy (src_base, coding->carryover, coding->carryover_size);
1516       break;
1517     }
1518
1519   /* If this is the last block of the text to be encoded, we must
1520      reset graphic planes and registers to the initial state.  */
1521   if (src >= src_end && coding->last_block)
1522     {
1523       ENCODE_RESET_PLANE_AND_REGISTER;
1524       if (coding->carryover_size > 0
1525           && coding->carryover_size < (dst_end - dst))
1526         {
1527           bcopy (coding->carryover, dst, coding->carryover_size);
1528           dst += coding->carryover_size;
1529           coding->carryover_size = 0;
1530         }
1531     }
1532   *consumed = src - source;
1533   return dst - destination;
1534 }
1535
1536 \f
1537 /*** 4. SJIS and BIG5 handlers ***/
1538
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
1542
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
1549
1550    --- CODE RANGE of SJIS ---
1551    (character set)      (range)
1552    ASCII                0x00 .. 0x7F
1553    KATAKANA-JISX0201    0xA0 .. 0xDF
1554    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1555             (2nd byte)  0x40 .. 0xFF
1556    -------------------------------
1557
1558 */
1559
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
1563
1564    --- CODE RANGE of BIG5 ---
1565    (character set)      (range)
1566    ASCII                0x00 .. 0x7F
1567    Big5 (1st byte)      0xA1 .. 0xFE
1568         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1569    --------------------------
1570
1571    Since the number of characters in Big5 is larger than maximum
1572    characters in Emacs' charset (96x96), it can't be handled as one
1573    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1574    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1575    contains frequently used characters and the latter contains less
1576    frequently used characters.  */
1577
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every parameter is parenthesized in the expansions, so an argument
   may be an arbitrary expression (e.g. `c & 0x7F').  B1, B2 and the
   output arguments appear more than once in an expansion, so they
   must not have side effects.  Output arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert BIG5 bytes B1 and B2 to CHARSET and position-codes C1, C2.
   The pair is first turned into a linear index (the gap between the
   two ranges of the 2nd byte is squeezed out), the index is rebased
   for `charset_big5_2' when B1 >= 0xC9, and finally it is split into
   two position-codes in base (0xFF - 0xA1).  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Convert CHARSET and position-codes C1, C2 to BIG5 bytes B1 and B2.
   This is the inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1610
1611 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1612    Check if a text is encoded in SJIS.  If it is, return
1613    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1614
1615 int
1616 detect_coding_sjis (src, src_end)
1617      unsigned char *src, *src_end;
1618 {
1619   unsigned char c;
1620
1621   while (src < src_end)
1622     {
1623       c = *src++;
1624       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1625         return 0;
1626       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1627         {
1628           if (src < src_end && *src++ < 0x40)
1629             return 0;
1630         }
1631     }
1632   return CODING_CATEGORY_MASK_SJIS;
1633 }
1634
1635 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1636    Check if a text is encoded in BIG5.  If it is, return
1637    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1638
1639 int
1640 detect_coding_big5 (src, src_end)
1641      unsigned char *src, *src_end;
1642 {
1643   unsigned char c;
1644
1645   while (src < src_end)
1646     {
1647       c = *src++;
1648       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1649         return 0;
1650       if (c >= 0xA1)
1651         {
1652           if (src >= src_end)
1653             break;
1654           c = *src++;
1655           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1656             return 0;
1657         }
1658     }
1659   return CODING_CATEGORY_MASK_BIG5;
1660 }
1661
1662 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1663    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1664
1665 int
1666 decode_coding_sjis_big5 (coding, source, destination,
1667                          src_bytes, dst_bytes, consumed, sjis_p)
1668      struct coding_system *coding;
1669      unsigned char *source, *destination;
1670      int src_bytes, dst_bytes;
1671      int *consumed;
1672      int sjis_p;
1673 {
1674   unsigned char *src = source;
1675   unsigned char *src_end = source + src_bytes;
1676   unsigned char *dst = destination;
1677   unsigned char *dst_end = destination + dst_bytes;
1678   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1679      from DST_END to assure overflow checking is necessary only at the
1680      head of loop.  */
1681   unsigned char *adjusted_dst_end = dst_end - 3;
1682
1683   while (src < src_end && dst < adjusted_dst_end)
1684     {
1685       /* SRC_BASE remembers the start position in source in each loop.
1686          The loop will be exited when there's not enough source text
1687          to analyze two-byte character (within macro ONE_MORE_BYTE).
1688          In that case, SRC is reset to SRC_BASE before exiting.  */
1689       unsigned char *src_base = src;
1690       unsigned char c1 = *src++, c2, c3, c4;
1691
1692       if (c1 == '\r')
1693         {
1694           if (coding->eol_type == CODING_EOL_CRLF)
1695             {
1696               ONE_MORE_BYTE (c2);
1697               if (c2 == '\n')
1698                 *dst++ = c2;
1699               else
1700                 /* To process C2 again, SRC is subtracted by 1.  */
1701                 *dst++ = c1, src--;
1702             }
1703           else
1704             *dst++ = c1;
1705         }
1706       else if (c1 < 0x80)
1707         *dst++ = c1;
1708       else if (c1 < 0xA0 || c1 >= 0xE0)
1709         {
1710           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1711           if (sjis_p)
1712             {
1713               ONE_MORE_BYTE (c2);
1714               DECODE_SJIS (c1, c2, c3, c4);
1715               DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1716             }
1717           else if (c1 >= 0xE0 && c1 < 0xFF)
1718             {
1719               int charset;
1720
1721               ONE_MORE_BYTE (c2);
1722               DECODE_BIG5 (c1, c2, charset, c3, c4);
1723               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1724             }
1725           else                  /* Invalid code */
1726             *dst++ = c1;
1727         }
1728       else
1729         {
1730           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1731           if (sjis_p)
1732             DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1733           else
1734             {
1735               int charset;
1736
1737               ONE_MORE_BYTE (c2);
1738               DECODE_BIG5 (c1, c2, charset, c3, c4);
1739               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1740             }
1741         }
1742       continue;
1743
1744     label_end_of_loop:
1745       coding->carryover_size = src - src_base;
1746       bcopy (src_base, coding->carryover, coding->carryover_size);
1747       src = src_base;
1748       break;
1749     }
1750
1751   *consumed = src - source;
1752   return dst - destination;
1753 }
1754
1755 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1756    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1757    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1758    sure that all these charsets are registered as official charset
1759    (i.e. do not have extended leading-codes).  Characters of other
1760    charsets are produced without any encoding.  If SJIS_P is 1, encode
1761    SJIS text, else encode BIG5 text.  */
1762
1763 int
1764 encode_coding_sjis_big5 (coding, source, destination,
1765                          src_bytes, dst_bytes, consumed, sjis_p)
1766      struct coding_system *coding;
1767      unsigned char *source, *destination;
1768      int src_bytes, dst_bytes;
1769      int *consumed;
1770      int sjis_p;
1771 {
1772   unsigned char *src = source;
1773   unsigned char *src_end = source + src_bytes;
1774   unsigned char *dst = destination;
1775   unsigned char *dst_end = destination + dst_bytes;
1776   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1777      from DST_END to assure overflow checking is necessary only at the
1778      head of loop.  */
1779   unsigned char *adjusted_dst_end = dst_end - 1;
1780
1781   while (src < src_end && dst < adjusted_dst_end)
1782     {
1783       /* SRC_BASE remembers the start position in source in each loop.
1784          The loop will be exited when there's not enough source text
1785          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1786          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1787          before exiting.  */
1788       unsigned char *src_base = src;
1789       unsigned char c1 = *src++, c2, c3, c4;
1790
1791       if (coding->composing)
1792         {
1793           if (c1 == 0xA0)
1794             {
1795               ONE_MORE_BYTE (c1);
1796               c1 &= 0x7F;
1797             }
1798           else if (c1 >= 0xA0)
1799             c1 -= 0x20;
1800           else
1801             coding->composing = 0;
1802         }
1803
1804       switch (emacs_code_class[c1])
1805         {
1806         case EMACS_ascii_code:
1807         case EMACS_control_code:
1808           *dst++ = c1;
1809           break;
1810
1811         case EMACS_carriage_return_code:
1812           if (!coding->selective)
1813             {
1814               *dst++ = c1;
1815               break;
1816             }
1817           /* fall down to treat '\r' as '\n' ...  */
1818
1819         case EMACS_linefeed_code:
1820           if (coding->eol_type == CODING_EOL_LF
1821               || coding->eol_type == CODING_EOL_UNDECIDED)
1822             *dst++ = '\n';
1823           else if (coding->eol_type == CODING_EOL_CRLF)
1824             *dst++ = '\r', *dst++ = '\n';
1825           else
1826             *dst++ = '\r';
1827           break;
1828
1829         case EMACS_leading_code_2:
1830           ONE_MORE_BYTE (c2);
1831           if (sjis_p && c1 == charset_katakana_jisx0201)
1832             *dst++ = c2;
1833           else
1834             *dst++ = c1, *dst++ = c2;
1835           break;
1836
1837         case EMACS_leading_code_3:
1838           TWO_MORE_BYTES (c2, c3);
1839           c2 &= 0x7F, c3 &= 0x7F;
1840           if (sjis_p && c1 == charset_jisx0208)
1841             {
1842               unsigned char s1, s2;
1843
1844               ENCODE_SJIS (c2, c3, s1, s2);
1845               *dst++ = s1, *dst++ = s2;
1846             }
1847           else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1848             {
1849               unsigned char b1, b2;
1850
1851               ENCODE_BIG5 (c1, c2, c3, b1, b2);
1852               *dst++ = b1, *dst++ = b2;
1853             }
1854           else
1855             *dst++ = c1, *dst++ = c2, *dst++ = c3;
1856           break;
1857
1858         case EMACS_leading_code_4:
1859           THREE_MORE_BYTES (c2, c3, c4);
1860           *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1861           break;
1862
1863         case EMACS_leading_code_composition:
1864           coding->composing = 1;
1865           break;
1866
1867         default:                /* i.e. case EMACS_invalid_code: */
1868           *dst++ = c1;
1869         }
1870       continue;
1871
1872     label_end_of_loop:
1873       coding->carryover_size = src - src_base;
1874       bcopy (src_base, coding->carryover, coding->carryover_size);
1875       src = src_base;
1876       break;
1877     }
1878
1879   *consumed = src - source;
1880   return dst - destination;
1881 }
1882
1883 \f
1884 /*** 5. End-of-line handlers ***/
1885
1886 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1887    This function is called only when `coding->eol_type' is
1888    CODING_EOL_CRLF or CODING_EOL_CR.  */
1889
1890 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1891      struct coding_system *coding;
1892      unsigned char *source, *destination;
1893      int src_bytes, dst_bytes;
1894      int *consumed;
1895 {
1896   unsigned char *src = source;
1897   unsigned char *src_end = source + src_bytes;
1898   unsigned char *dst = destination;
1899   unsigned char *dst_end = destination + dst_bytes;
1900   int produced;
1901
1902   switch (coding->eol_type)
1903     {
1904     case CODING_EOL_CRLF:
1905       {
1906         /* Since the maximum bytes produced by each loop is 2, we
1907            subtract 1 from DST_END to assure overflow checking is
1908            necessary only at the head of loop.  */
1909         unsigned char *adjusted_dst_end = dst_end - 1;
1910
1911         while (src < src_end && dst < adjusted_dst_end)
1912           {
1913             unsigned char *src_base = src;
1914             unsigned char c = *src++;
1915             if (c == '\r')
1916               {
1917                 ONE_MORE_BYTE (c);
1918                 if (c != '\n')
1919                   *dst++ = '\r';
1920                 *dst++ = c;
1921               }
1922             else
1923               *dst++ = c;
1924             continue;
1925
1926           label_end_of_loop:
1927             coding->carryover_size = src - src_base;
1928             bcopy (src_base, coding->carryover, coding->carryover_size);
1929             src = src_base;
1930             break;
1931           }
1932         *consumed = src - source;
1933         produced = dst - destination;
1934         break;
1935       }
1936
1937     case CODING_EOL_CR:
1938       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1939       bcopy (source, destination, produced);
1940       dst_end = destination + produced;
1941       while (dst < dst_end)
1942         if (*dst++ == '\r') dst[-1] = '\n';
1943       *consumed = produced;
1944       break;
1945
1946     default:                    /* i.e. case: CODING_EOL_LF */
1947       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1948       bcopy (source, destination, produced);
1949       *consumed = produced;
1950       break;
1951     }
1952
1953   return produced;
1954 }
1955
1956 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
1957    format of end-of-line according to `coding->eol_type'.  If
1958    `coding->selective' is 1, code '\r' in source text also means
1959    end-of-line.  */
1960
1961 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1962      struct coding_system *coding;
1963      unsigned char *source, *destination;
1964      int src_bytes, dst_bytes;
1965      int *consumed;
1966 {
1967   unsigned char *src = source;
1968   unsigned char *dst = destination;
1969   int produced;
1970
1971   if (src_bytes <= 0)
1972     return 0;
1973
1974   switch (coding->eol_type)
1975     {
1976     case CODING_EOL_LF:
1977     case CODING_EOL_UNDECIDED:
1978       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1979       bcopy (source, destination, produced);
1980       if (coding->selective)
1981         {
1982           int i = produced;
1983           while (i--)
1984             if (*dst++ == '\r') dst[-1] = '\n';
1985         }
1986       *consumed = produced;
1987
1988     case CODING_EOL_CRLF:
1989       {
1990         unsigned char c;
1991         unsigned char *src_end = source + src_bytes;
1992         unsigned char *dst_end = destination + dst_bytes;
1993         /* Since the maximum bytes produced by each loop is 2, we
1994            subtract 1 from DST_END to assure overflow checking is
1995            necessary only at the head of loop.  */
1996         unsigned char *adjusted_dst_end = dst_end - 1;
1997
1998         while (src < src_end && dst < adjusted_dst_end)
1999           {
2000             c = *src++;
2001             if (c == '\n' || (c == '\r' && coding->selective))
2002               *dst++ = '\r', *dst++ = '\n';
2003             else
2004               *dst++ = c;
2005           }
2006         produced = dst - destination;
2007         *consumed = src - source;
2008         break;
2009       }
2010
2011     default:                    /* i.e. case CODING_EOL_CR: */
2012       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2013       bcopy (source, destination, produced);
2014       {
2015         int i = produced;
2016         while (i--)
2017           if (*dst++ == '\n') dst[-1] = '\r';
2018       }
2019       *consumed = produced;
2020     }
2021
2022   return produced;
2023 }
2024
2025 \f
2026 /*** 6. C library functions ***/
2027
2028 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2029    has a property `coding-system'.  The value of this property is a
2030    vector of length 5 (called as coding-vector).  Among elements of
2031    this vector, the first (element[0]) and the fifth (element[4])
2032    carry important information for decoding/encoding.  Before
2033    decoding/encoding, this information should be set in fields of a
2034    structure of type `coding_system'.
2035
2036    A value of property `coding-system' can be a symbol of another
2037    subsidiary coding-system.  In that case, Emacs gets coding-vector
2038    from that symbol.
2039
2040    `element[0]' contains information to be set in `coding->type'.  The
2041    value and its meaning is as follows:
2042
2043    0 -- coding_type_emacs_mule
2044    1 -- coding_type_sjis
2045    2 -- coding_type_iso2022
2046    3 -- coding_type_big5
2047    4 -- coding_type_ccl encoder/decoder written in CCL
2048    nil -- coding_type_no_conversion
2049    t -- coding_type_undecided (automatic conversion on decoding,
2050                                no-conversion on encoding)
2051
2052    `element[4]' contains information to be set in `coding->flags' and
2053    `coding->spec'.  The meaning varies by `coding->type'.
2054
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 15 sub-elements are used now).
   Meanings of these sub-elements are:
2058
2059    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2060         If the value is an integer of valid charset, the charset is
2061         assumed to be designated to graphic register N initially.
2062
2063         If the value is minus, it is a minus value of charset which
2064         reserves graphic register N, which means that the charset is
2065         not designated initially but should be designated to graphic
2066         register N just before encoding a character in that charset.
2067
2068         If the value is nil, graphic register N is never used on
2069         encoding.
2070
   sub-element[N] where N is 4 through 14: to be set in `coding->flags'
        Each value takes t or nil.  See the section ISO2022 of
        `coding.h' for more information.
2074
2075    If `coding->type' is `coding_type_big5', element[4] is t to denote
2076    BIG5-ETen or nil to denote BIG5-HKU.
2077
2078    If `coding->type' takes the other value, element[4] is ignored.
2079
2080    Emacs Lisp's coding system also carries information about format of
2081    end-of-line in a value of property `eol-type'.  If the value is
2082    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2083    means CODING_EOL_CR.  If it is not integer, it should be a vector
2084    of subsidiary coding systems of which property `eol-type' has one
2085    of above values.
2086
2087 */
2088
2089 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2090    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2091    is setup so that no conversion is necessary and return -1, else
2092    return 0.  */
2093
2094 int
2095 setup_coding_system (coding_system, coding)
2096      Lisp_Object coding_system;
2097      struct coding_system *coding;
2098 {
2099   Lisp_Object type, eol_type;
2100
2101   /* At first, set several fields default values.  */
2102   coding->require_flushing = 0;
2103   coding->last_block = 0;
2104   coding->selective = 0;
2105   coding->composing = 0;
2106   coding->direction = 0;
2107   coding->carryover_size = 0;
2108   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2109   /* We have not yet implemented a way to specify unification table in
2110      a coding system.  */
2111   coding->character_unification_table = Qnil;
2112
2113   Vlast_coding_system_used = coding->symbol = coding_system;
2114   eol_type = Qnil;
2115   /* Get value of property `coding-system' until we get a vector.
2116      While doing that, also get values of properties
2117      `post-read-conversion', `pre-write-conversion', and `eol-type'.  */
2118   while (!NILP (coding_system) && SYMBOLP (coding_system))
2119     {
2120       if (NILP (coding->post_read_conversion))
2121         coding->post_read_conversion = Fget (coding_system,
2122                                              Qpost_read_conversion);
2123       if (NILP (coding->pre_write_conversion))
2124         coding->pre_write_conversion = Fget (coding_system,
2125                                              Qpre_write_conversion);
2126       if (NILP (eol_type))
2127         eol_type = Fget (coding_system, Qeol_type);
2128       coding_system = Fget (coding_system, Qcoding_system);
2129     }
2130   if (!VECTORP (coding_system)
2131       || XVECTOR (coding_system)->size != 5)
2132     goto label_invalid_coding_system;
2133
2134   if (VECTORP (eol_type))
2135     coding->eol_type = CODING_EOL_UNDECIDED;
2136   else if (XFASTINT (eol_type) == 1)
2137     coding->eol_type = CODING_EOL_CRLF;
2138   else if (XFASTINT (eol_type) == 2)
2139     coding->eol_type = CODING_EOL_CR;
2140   else
2141     coding->eol_type = CODING_EOL_LF;
2142
2143   type = XVECTOR (coding_system)->contents[0];
2144   switch (XFASTINT (type))
2145     {
2146     case 0:
2147       coding->type = coding_type_emacs_mule;
2148       break;
2149
2150     case 1:
2151       coding->type = coding_type_sjis;
2152       break;
2153
2154     case 2:
2155       coding->type = coding_type_iso2022;
2156       {
2157         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2158         Lisp_Object *flags;
2159         int i, charset, default_reg_bits = 0;
2160
2161         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2162           goto label_invalid_coding_system;
2163
2164         flags = XVECTOR (val)->contents;
2165         coding->flags
2166           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2167              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2168              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2169              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2170              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2171              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2172              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2173              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2174              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2175              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2176              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2177
2178         /* Invoke graphic register 0 to plane 0.  */
2179         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2180         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2181         CODING_SPEC_ISO_INVOCATION (coding, 1)
2182           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2183         /* Not single shifting at first.  */
2184         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2185         /* Beginning of buffer should also be regarded as bol. */
2186         CODING_SPEC_ISO_BOL(coding) = 1;
2187
2188         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2189            FLAGS[REG] can be one of below:
2190                 integer CHARSET: CHARSET occupies register I,
2191                 t: designate nothing to REG initially, but can be used
2192                   by any charsets,
2193                 list of integer, nil, or t: designate the first
2194                   element (if integer) to REG initially, the remaining
2195                   elements (if integer) is designated to REG on request,
2196                   if an element is t, REG can be used by any charset,
2197                 nil: REG is never used.  */
2198         for (charset = 0; charset <= MAX_CHARSET; charset++)
2199           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2200             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2201         for (i = 0; i < 4; i++)
2202           {
2203             if (INTEGERP (flags[i])
2204                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2205                 || (charset = get_charset_id (flags[i])) >= 0)
2206               {
2207                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2208                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2209               }
2210             else if (EQ (flags[i], Qt))
2211               {
2212                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2213                 default_reg_bits |= 1 << i;
2214               }
2215             else if (CONSP (flags[i]))
2216               {
2217                 Lisp_Object tail = flags[i];
2218
2219                 if (INTEGERP (XCONS (tail)->car)
2220                     && (charset = XINT (XCONS (tail)->car),
2221                         CHARSET_VALID_P (charset))
2222                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2223                   {
2224                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2225                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2226                   }
2227                 else
2228                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2229                 tail = XCONS (tail)->cdr;
2230                 while (CONSP (tail))
2231                   {
2232                     if (INTEGERP (XCONS (tail)->car)
2233                         && (charset = XINT (XCONS (tail)->car),
2234                             CHARSET_VALID_P (charset))
2235                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2236                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2237                         = i;
2238                     else if (EQ (XCONS (tail)->car, Qt))
2239                       default_reg_bits |= 1 << i;
2240                     tail = XCONS (tail)->cdr;
2241                   }
2242               }
2243             else
2244               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2245
2246             CODING_SPEC_ISO_DESIGNATION (coding, i)
2247               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2248           }
2249
2250         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2251           {
2252             /* REG 1 can be used only by locking shift in 7-bit env.  */
2253             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2254               default_reg_bits &= ~2;
2255             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2256               /* Without any shifting, only REG 0 and 1 can be used.  */
2257               default_reg_bits &= 3;
2258           }
2259
2260         for (charset = 0; charset <= MAX_CHARSET; charset++)
2261           if (CHARSET_VALID_P (charset)
2262               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2263                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2264             {
2265               /* We have not yet decided where to designate CHARSET.  */
2266               int reg_bits = default_reg_bits;
2267
2268               if (CHARSET_CHARS (charset) == 96)
2269                 /* A charset of CHARS96 can't be designated to REG 0.  */
2270                 reg_bits &= ~1;
2271
2272               if (reg_bits)
2273                 /* There exist some default graphic register.  */
2274                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2275                   = (reg_bits & 1
2276                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2277               else
2278                 /* We anyway have to designate CHARSET to somewhere.  */
2279                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2280                   = (CHARSET_CHARS (charset) == 94
2281                      ? 0
2282                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2283                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2284                         ? 1
2285                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2286                            ? 2 : 0)));
2287             }
2288       }
2289       coding->require_flushing = 1;
2290       break;
2291
2292     case 3:
2293       coding->type = coding_type_big5;
2294       coding->flags
2295         = (NILP (XVECTOR (coding_system)->contents[4])
2296            ? CODING_FLAG_BIG5_HKU
2297            : CODING_FLAG_BIG5_ETEN);
2298       break;
2299
2300     case 4:
2301       coding->type = coding_type_ccl;
2302       {
2303         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2304         if (CONSP  (val)
2305             && VECTORP (XCONS (val)->car)
2306             && VECTORP (XCONS (val)->cdr))
2307           {
2308             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2309             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2310           }
2311         else
2312           goto label_invalid_coding_system;
2313       }
2314       coding->require_flushing = 1;
2315       break;
2316
2317     default:
2318       if (EQ (type, Qt))
2319         coding->type = coding_type_undecided;
2320       else
2321         coding->type = coding_type_no_conversion;
2322       break;
2323     }
2324   return 0;
2325
2326  label_invalid_coding_system:
2327   coding->type = coding_type_no_conversion;
2328   coding->eol_type = CODING_EOL_LF;
2329   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2330     = Qnil;
2331   return -1;
2332 }
2333
2334 /* Emacs has a mechanism to automatically detect a coding system if it
2335    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2336    it's impossible to distinguish some coding systems accurately
2337    because they use the same range of codes.  So, at first, coding
2338    systems are categorized into 8, those are:
2339
2340    o coding-category-emacs-mule
2341
2342         The category for a coding system which has the same code range
2343         as Emacs' internal format.  Assigned the coding-system (Lisp
2344         symbol) `emacs-mule' by default.
2345
2346    o coding-category-sjis
2347
2348         The category for a coding system which has the same code range
2349         as SJIS.  Assigned the coding-system (Lisp
2350         symbol) `shift-jis' by default.
2351
2352    o coding-category-iso-7
2353
2354         The category for a coding system which has the same code range
2355         as ISO2022 of 7-bit environment.  Assigned the coding-system
2356         (Lisp symbol) `iso-2022-7' by default.
2357
2358    o coding-category-iso-8-1
2359
2360         The category for a coding system which has the same code range
2361         as ISO2022 of 8-bit environment and graphic plane 1 used only
2362         for DIMENSION1 charset.  Assigned the coding-system (Lisp
2363         symbol) `iso-8859-1' by default.
2364
2365    o coding-category-iso-8-2
2366
2367         The category for a coding system which has the same code range
2368         as ISO2022 of 8-bit environment and graphic plane 1 used only
2369         for DIMENSION2 charset.  Assigned the coding-system (Lisp
2370         symbol) `euc-japan' by default.
2371
2372    o coding-category-iso-else
2373
2374         The category for a coding system which has the same code range
2375         as ISO2022 but does not belong to any of the above three
2376         categories.  Assigned the coding-system (Lisp symbol)
2377         `iso-2022-ss2-7' by default.
2378
2379    o coding-category-big5
2380
2381         The category for a coding system which has the same code range
2382         as BIG5.  Assigned the coding-system (Lisp symbol)
2383         `cn-big5' by default.
2384
2385    o coding-category-binary
2386
2387         The category for a coding system not categorized in any of the
2388         above.  Assigned the coding-system (Lisp symbol)
2389         `no-conversion' by default.
2390
2391    Each of them is a Lisp symbol and the value is an actual
2392    `coding-system's (this is also a Lisp symbol) assigned by a user.
2393    What Emacs does actually is to detect a category of coding system.
2394    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2395    decide only one possible category, it selects a category of the
2396    highest priority.  Priorities of categories are also specified by a
2397    user in a Lisp variable `coding-category-list'.
2398
2399 */
2400
2401 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2402    If it detects possible coding systems, return an integer in which
2403    appropriate flag bits are set.  Flag bits are defined by macros
2404    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2405
2406 int
2407 detect_coding_mask (src, src_bytes)
2408      unsigned char *src;
2409      int src_bytes;
2410 {
2411   register unsigned char c;
2412   unsigned char *src_end = src + src_bytes;
2413   int mask;
2414
2415   /* At first, skip all ASCII characters and control characters except
2416      for three ISO2022 specific control characters.  */
2417  label_loop_detect_coding:
2418   while (src < src_end)
2419     {
2420       c = *src;
2421       if (c >= 0x80
2422           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2423         break;
2424       src++;
2425     }
2426
2427   if (src >= src_end)
2428     /* We found nothing other than ASCII.  There's nothing to do.  */
2429     return CODING_CATEGORY_MASK_ANY;
2430
2431   /* The text seems to be encoded in some multilingual coding system.
2432      Now, try to find in which coding system the text is encoded.  */
2433   if (c < 0x80)
2434     {
2435       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2436       /* C is an ISO2022 specific control code of C0.  */
2437       mask = detect_coding_iso2022 (src, src_end);
2438       src++;
2439       if (mask == CODING_CATEGORY_MASK_ANY)
2440         /* No valid ISO2022 code follows C.  Try again.  */
2441         goto label_loop_detect_coding;
2442     }
2443   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2444     /* C is an ISO2022 specific control code of C1,
2445        or the first byte of SJIS's 2-byte character code,
2446        or a leading code of Emacs.  */
2447     mask = (detect_coding_iso2022 (src, src_end)
2448             | detect_coding_sjis (src, src_end)
2449             | detect_coding_emacs_mule (src, src_end));
2450
2451   else if (c < 0xA0)
2452     /* C is the first byte of SJIS character code,
2453        or a leading-code of Emacs.  */
2454     mask = (detect_coding_sjis (src, src_end)
2455             | detect_coding_emacs_mule (src, src_end));
2456
2457   else
2458     /* C is a character of ISO2022 in graphic plane right,
2459        or a SJIS's 1-byte character code (i.e. JISX0201),
2460        or the first byte of BIG5's 2-byte code.  */
2461     mask = (detect_coding_iso2022 (src, src_end)
2462             | detect_coding_sjis (src, src_end)
2463             | detect_coding_big5 (src, src_end));
2464
2465   return mask;
2466 }
2467
2468 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2469    The information of the detected coding system is set in CODING.  */
2470
2471 void
2472 detect_coding (coding, src, src_bytes)
2473      struct coding_system *coding;
2474      unsigned char *src;
2475      int src_bytes;
2476 {
2477   int mask = detect_coding_mask (src, src_bytes);
2478   int idx;
2479
2480   if (mask == CODING_CATEGORY_MASK_ANY)
2481     /* We found nothing other than ASCII.  There's nothing to do.  */
2482     return;
2483
2484   if (!mask)
2485     /* The source text seems to be encoded in unknown coding system.
2486        Emacs regards the category of such a kind of coding system as
2487        `coding-category-binary'.  We assume that a user has assigned
2488        an appropriate coding system for a `coding-category-binary'.  */
2489     idx = CODING_CATEGORY_IDX_BINARY;
2490   else
2491     {
2492       /* We found some plausible coding systems.  Let's use a coding
2493          system of the highest priority.  */
2494       Lisp_Object val = Vcoding_category_list;
2495
2496       if (CONSP (val))
2497         while (!NILP (val))
2498           {
2499             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2500             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2501               break;
2502             val = XCONS (val)->cdr;
2503           }
2504       else
2505         val = Qnil;
2506
2507       if (NILP (val))
2508         {
2509           /* For unknown reason, `Vcoding_category_list' contains none
2510              of found categories.  Let's use any of them.  */
2511           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2512             if (mask & (1 << idx))
2513               break;
2514         }
2515     }
2516   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2517 }
2518
2519 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2520    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2521    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2522
2523 int
2524 detect_eol_type (src, src_bytes)
2525      unsigned char *src;
2526      int src_bytes;
2527 {
2528   unsigned char *src_end = src + src_bytes;
2529   unsigned char c;
2530
2531   while (src < src_end)
2532     {
2533       c = *src++;
2534       if (c == '\n')
2535         return CODING_EOL_LF;
2536       else if (c == '\r')
2537         {
2538           if (src < src_end && *src == '\n')
2539             return CODING_EOL_CRLF;
2540           else
2541             return CODING_EOL_CR;
2542         }
2543     }
2544   return CODING_EOL_UNDECIDED;
2545 }
2546
2547 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2548    is encoded.  If it detects an appropriate format of end-of-line, it
2549    sets the information in *CODING.  */
2550
2551 void
2552 detect_eol (coding, src, src_bytes)
2553      struct coding_system *coding;
2554      unsigned char *src;
2555      int src_bytes;
2556 {
2557   Lisp_Object val;
2558   int eol_type = detect_eol_type (src, src_bytes);
2559
2560   if (eol_type == CODING_EOL_UNDECIDED)
2561     /*  We found no end-of-line in the source text.  */
2562     return;
2563
2564   val = Fget (coding->symbol, Qeol_type);
2565   if (VECTORP (val) && XVECTOR (val)->size == 3)
2566     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2567 }
2568
2569 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2570    decoding, it may detect coding system and format of end-of-line if
2571    those are not yet decided.  */
2572
2573 int
2574 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2575      struct coding_system *coding;
2576      unsigned char *source, *destination;
2577      int src_bytes, dst_bytes;
2578      int *consumed;
2579 {
2580   int produced;
2581
2582   if (src_bytes <= 0)
2583     {
2584       *consumed = 0;
2585       return 0;
2586     }
2587
2588   if (coding->type == coding_type_undecided)
2589     detect_coding (coding, source, src_bytes);
2590
2591   if (coding->eol_type == CODING_EOL_UNDECIDED)
2592     detect_eol (coding, source, src_bytes);
2593
2594   coding->carryover_size = 0;
2595   switch (coding->type)
2596     {
2597     case coding_type_no_conversion:
2598     label_no_conversion:
2599       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2600       bcopy (source, destination, produced);
2601       *consumed = produced;
2602       break;
2603
2604     case coding_type_emacs_mule:
2605     case coding_type_undecided:
2606       if (coding->eol_type == CODING_EOL_LF
2607           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2608         goto label_no_conversion;
2609       produced = decode_eol (coding, source, destination,
2610                              src_bytes, dst_bytes, consumed);
2611       break;
2612
2613     case coding_type_sjis:
2614       produced = decode_coding_sjis_big5 (coding, source, destination,
2615                                           src_bytes, dst_bytes, consumed,
2616                                           1);
2617       break;
2618
2619     case coding_type_iso2022:
2620       produced = decode_coding_iso2022 (coding, source, destination,
2621                                         src_bytes, dst_bytes, consumed);
2622       break;
2623
2624     case coding_type_big5:
2625       produced = decode_coding_sjis_big5 (coding, source, destination,
2626                                           src_bytes, dst_bytes, consumed,
2627                                           0);
2628       break;
2629
2630     case coding_type_ccl:
2631       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2632                              src_bytes, dst_bytes, consumed);
2633       break;
2634     }
2635
2636   return produced;
2637 }
2638
2639 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2640
2641 int
2642 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2643      struct coding_system *coding;
2644      unsigned char *source, *destination;
2645      int src_bytes, dst_bytes;
2646      int *consumed;
2647 {
2648   int produced;
2649
2650   coding->carryover_size = 0;
2651   switch (coding->type)
2652     {
2653     case coding_type_no_conversion:
2654     label_no_conversion:
2655       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2656       if (produced > 0)
2657         {
2658           bcopy (source, destination, produced);
2659           if (coding->selective)
2660             {
2661               unsigned char *p = destination, *pend = destination + produced;
2662               while (p < pend)
2663                 if (*p++ == '\015') p[-1] = '\n';
2664             }
2665         }
2666       *consumed = produced;
2667       break;
2668
2669     case coding_type_emacs_mule:
2670     case coding_type_undecided:
2671       if (coding->eol_type == CODING_EOL_LF
2672           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2673         goto label_no_conversion;
2674       produced = encode_eol (coding, source, destination,
2675                              src_bytes, dst_bytes, consumed);
2676       break;
2677
2678     case coding_type_sjis:
2679       produced = encode_coding_sjis_big5 (coding, source, destination,
2680                                           src_bytes, dst_bytes, consumed,
2681                                           1);
2682       break;
2683
2684     case coding_type_iso2022:
2685       produced = encode_coding_iso2022 (coding, source, destination,
2686                                         src_bytes, dst_bytes, consumed);
2687       break;
2688
2689     case coding_type_big5:
2690       produced = encode_coding_sjis_big5 (coding, source, destination,
2691                                           src_bytes, dst_bytes, consumed,
2692                                           0);
2693       break;
2694
2695     case coding_type_ccl:
2696       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2697                              src_bytes, dst_bytes, consumed);
2698       break;
2699     }
2700
2701   return produced;
2702 }
2703
2704 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2705
2706 /* Return maximum size (bytes) of a buffer enough for decoding
2707    SRC_BYTES of text encoded in CODING.  */
2708
2709 int
2710 decoding_buffer_size (coding, src_bytes)
2711      struct coding_system *coding;
2712      int src_bytes;
2713 {
2714   int magnification;
2715
2716   if (coding->type == coding_type_iso2022)
2717     magnification = 3;
2718   else if (coding->type == coding_type_ccl)
2719     magnification = coding->spec.ccl.decoder.buf_magnification;
2720   else
2721     magnification = 2;
2722
2723   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2724 }
2725
2726 /* Return maximum size (bytes) of a buffer enough for encoding
2727    SRC_BYTES of text to CODING.  */
2728
2729 int
2730 encoding_buffer_size (coding, src_bytes)
2731      struct coding_system *coding;
2732      int src_bytes;
2733 {
2734   int magnification;
2735
2736   if (coding->type == coding_type_ccl)
2737     magnification = coding->spec.ccl.encoder.buf_magnification;
2738   else
2739     magnification = 3;
2740
2741   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2742 }
2743
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Scratch buffer shared by the conversion routines, and its current
   capacity in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is the shared `conversion_buffer';
   when it is too small it is replaced by a larger one, at least
   MINIMUM_CONVERSION_BUFFER_SIZE bytes long and at least twice the
   previous capacity.  The old contents are NOT carried over to the
   new buffer, and any pointer previously returned becomes invalid.

   NOTE(review): allocation goes through `xmalloc'; what happens when
   memory is exhausted depends on it -- nothing here returns NULL on
   its own.  Confirm against `xmalloc'.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* If the buffer has never been set up, its capacity is zero and
         doubling zero would never reach SIZE.  Start from the minimum
         size in that case so that the loop below terminates.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size)
        real_size *= 2;

      /* Get the new buffer before releasing the old one, so that the
         globals stay consistent if the allocator does not return.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2772
2773 \f
2774 #ifdef emacs
2775 /*** 7. Emacs Lisp library functions ***/
2776
2777 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2778        1, 1, 0,
2779   "Return coding-spec of CODING-SYSTEM.\n\
2780 If CODING-SYSTEM is not a valid coding-system, return nil.")
2781   (obj)
2782      Lisp_Object obj;
2783 {
2784   while (SYMBOLP (obj) && !NILP (obj))
2785     obj = Fget (obj, Qcoding_system);
2786   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2787           ? Qnil : obj);
2788 }
2789
2790 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2791   "Return t if OBJECT is nil or a coding-system.\n\
2792 See document of make-coding-system for coding-system object.")
2793   (obj)
2794      Lisp_Object obj;
2795 {
2796   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2797 }
2798
2799 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2800        Sread_non_nil_coding_system, 1, 1, 0,
2801   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2802   (prompt)
2803      Lisp_Object prompt;
2804 {
2805   Lisp_Object val;
2806   do
2807     {
2808       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2809                               Qt, Qnil, Qnil, Qnil);
2810     }
2811   while (XSTRING (val)->size == 0);
2812   return (Fintern (val, Qnil));
2813 }
2814
2815 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2816   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2817   (prompt)
2818      Lisp_Object prompt;
2819 {
2820   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2821                                       Qt, Qnil, Qnil, Qnil);
2822   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2823 }
2824
2825 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2826        1, 1, 0,
2827   "Check validity of CODING-SYSTEM.\n\
2828 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2829 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2830 The value of property should be a vector of length 5.")
2831   (coding_system)
2832      Lisp_Object coding_system;
2833 {
2834   CHECK_SYMBOL (coding_system, 0);
2835   if (!NILP (Fcoding_system_p (coding_system)))
2836     return coding_system;
2837   while (1)
2838     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2839 }
2840
2841 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2842        2, 2, 0,
2843   "Detect coding-system of the text in the region between START and END.\n\
2844 Return a list of possible coding-systems ordered by priority.\n\
2845 If only ASCII characters are found, it returns `undecided'\n\
2846  or its subsidiary coding-system according to a detected end-of-line format.")
2847   (b, e)
2848      Lisp_Object b, e;
2849 {
2850   int coding_mask, eol_type;
2851   Lisp_Object val;
2852   int beg, end;
2853
2854   validate_region (&b, &e);
2855   beg = XINT (b), end = XINT (e);
2856   if (beg < GPT && end >= GPT) move_gap (end);
2857
2858   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2859   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2860
2861   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2862     {
2863       val = intern ("undecided");
2864       if (eol_type != CODING_EOL_UNDECIDED)
2865         {
2866           Lisp_Object val2 = Fget (val, Qeol_type);
2867           if (VECTORP (val2))
2868             val = XVECTOR (val2)->contents[eol_type];
2869         }
2870     }
2871   else
2872     {
2873       Lisp_Object val2;
2874
2875       /* At first, gather possible coding-systems in VAL in a reverse
2876          order.  */
2877       val = Qnil;
2878       for (val2 = Vcoding_category_list;
2879            !NILP (val2);
2880            val2 = XCONS (val2)->cdr)
2881         {
2882           int idx
2883             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2884           if (coding_mask & (1 << idx))
2885             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2886         }
2887
2888       /* Then, change the order of the list, while getting subsidiary
2889          coding-systems.  */
2890       val2 = val;
2891       val = Qnil;
2892       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2893         {
2894           if (eol_type == CODING_EOL_UNDECIDED)
2895             val = Fcons (XCONS (val2)->car, val);
2896           else
2897             {
2898               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2899               if (VECTORP (val3))
2900                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2901               else
2902                 val = Fcons (XCONS (val2)->car, val);
2903             }
2904         }
2905     }
2906
2907   return val;
2908 }
2909
2910 /* Scan text in the region between *BEGP and *ENDP, skip characters
2911    which we never have to encode to (iff ENCODEP is 1) or decode from
2912    coding system CODING at the head and tail, then set BEGP and ENDP
2913    to the addresses of start and end of the text we actually convert.  */
2914
2915 void
2916 shrink_conversion_area (begp, endp, coding, encodep)
2917      unsigned char **begp, **endp;
2918      struct coding_system *coding;
2919      int encodep;
2920 {
2921   register unsigned char *beg_addr = *begp, *end_addr = *endp;
2922
2923   if (coding->eol_type != CODING_EOL_LF
2924       && coding->eol_type != CODING_EOL_UNDECIDED)
2925     /* Since we anyway have to convert end-of-line format, it is not
2926        worth skipping at most 100 bytes or so.  */
2927     return;
2928
2929   if (encodep)                  /* for encoding */
2930     {
2931       switch (coding->type)
2932         {
2933         case coding_type_no_conversion:
2934         case coding_type_emacs_mule:
2935         case coding_type_undecided:
2936           /* We need no conversion.  */
2937           *begp = *endp;
2938           return;
2939         case coding_type_ccl:
2940           /* We can't skip any data.  */
2941           return;
2942         case coding_type_iso2022:
2943           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2944             {
2945               unsigned char *bol = beg_addr;
2946               while (beg_addr < end_addr && *beg_addr < 0x80)
2947                 {
2948                   beg_addr++;
2949                   if (*(beg_addr - 1) == '\n')
2950                     bol = beg_addr;
2951                 }
2952               beg_addr = bol;
2953               goto label_skip_tail;
2954             }
2955           /* fall down ... */
2956         default:
2957           /* We can skip all ASCII characters at the head and tail.  */
2958           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2959         label_skip_tail:
2960           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2961           break;
2962         }
2963     }
2964   else                          /* for decoding */
2965     {
2966       switch (coding->type)
2967         {
2968         case coding_type_no_conversion:
2969           /* We need no conversion.  */
2970           *begp = *endp;
2971           return;
2972         case coding_type_emacs_mule:
2973           if (coding->eol_type == CODING_EOL_LF)
2974             {
2975               /* We need no conversion.  */
2976               *begp = *endp;
2977               return;
2978             }
2979           /* We can skip all but carriage-return.  */
2980           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2981           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2982           break;
2983         case coding_type_sjis:
2984         case coding_type_big5:
2985           /* We can skip all ASCII characters at the head.  */
2986           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2987           /* We can skip all ASCII characters at the tail except for
2988              the second byte of SJIS or BIG5 code.  */
2989           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2990           if (end_addr != *endp)
2991             end_addr++;
2992           break;
2993         case coding_type_ccl:
2994           /* We can't skip any data.  */
2995           return;
2996         default:                /* i.e. case coding_type_iso2022: */
2997           {
2998             unsigned char c;
2999
3000             /* We can skip all ASCII characters except for a few
3001                control codes at the head.  */
3002             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3003                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3004                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3005               beg_addr++;
3006           }
3007           break;
3008         }
3009     }
3010   *begp = beg_addr;
3011   *endp = end_addr;
3012   return;
3013 }
3014
/* Encode to (iff ENCODEP is 1) or decode from coding system CODING a
   text between B and E.  B and E are buffer positions.

   When encoding, CODING->pre_write_conversion (if non-nil) is called
   first with B and E; when decoding, CODING->post_read_conversion (if
   non-nil) is called last with the length of the converted text.
   The converted text replaces the original text in the buffer.

   The value is a Lisp integer: the value returned by the
   post-read-conversion function if one was called, otherwise the
   length computed for the text after conversion.  */

Lisp_Object
code_convert_region (b, e, coding, encodep)
     Lisp_Object b, e;
     struct coding_system *coding;
     int encodep;
{
  int beg, end, len, consumed, produced;
  char *buf;
  unsigned char *begp, *endp;
  /* Remember point so that it can be put back after the replacement.  */
  int pos = PT;

  validate_region (&b, &e);
  beg = XINT (b), end = XINT (e);
  /* If the region straddles the gap, move the gap to END.  */
  if (beg < GPT && end >= GPT)
    move_gap (end);

  if (encodep && !NILP (coding->pre_write_conversion))
    {
      /* We must call a pre-conversion function which may put a new
         text to be converted in a new buffer.  */
      struct buffer *old = current_buffer, *new;

      TEMP_SET_PT (beg);
      call2 (coding->pre_write_conversion, b, e);
      if (old != current_buffer)
        {
          /* Replace the original text by the text just generated.  */
          len = ZV - BEGV;
          new = current_buffer;
          set_buffer_internal (old);
          del_range (beg, end);
          insert_from_buffer (new, 1, len, 0);
          /* From here on END is the end of the generated text; B and
             E still hold the positions given by the caller.  */
          end = beg + len;
        }
    }

  /* We may be able to shrink the conversion region.  */
  begp = POS_ADDR (beg); endp = begp + (end - beg);
  shrink_conversion_area (&begp, &endp, coding, encodep);

  if (begp == endp)
    /* We need no conversion.  */
    len = end - beg;
  else
    {
      /* Turn the shrunk addresses back into buffer positions.  */
      beg += begp - POS_ADDR (beg);
      end =  beg + (endp - begp);

      /* Get a work buffer big enough for the converted text.  */
      if (encodep)
        len = encoding_buffer_size (coding, end - beg);
      else
        len = decoding_buffer_size (coding, end - beg);
      buf = get_conversion_buffer (len);

      /* The whole region is converted in a single call.  */
      coding->last_block = 1;
      produced = (encodep
                  ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
                                   &consumed)
                  : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
                                   &consumed));

      /* Length of the result: the converted bytes plus the skipped
         head and tail.
         NOTE(review): the tail is measured against the original E,
         while END may have been changed by the pre-write conversion
         above -- confirm this is intended.  */
      len = produced + (beg - XINT (b)) + (XINT (e) - end);

      /* Insert the converted text in front of the old text, then
         delete the old text which now follows point.  */
      TEMP_SET_PT (beg);
      insert (buf, produced);
      del_range (PT, PT + end - beg);
      /* Put point back: shifted along with the text if it was at or
         after the converted part, moved to its start if it was inside
         it, untouched otherwise.  */
      if (pos >= end)
        pos = PT + (pos - end);
      else if (pos > beg)
        pos = beg;
      TEMP_SET_PT (pos);
  }

  if (!encodep && !NILP (coding->post_read_conversion))
    {
      /* We must call a post-conversion function which may alter
         the text just converted.  */
      Lisp_Object insval;

      beg = XINT (b);
      TEMP_SET_PT (beg);
      insval = call1 (coding->post_read_conversion, make_number (len));
      /* The function must return a number; it becomes our value.  */
      CHECK_NUMBER (insval, 0);
      len = XINT (insval);
    }

  return make_number (len);
}
3106
3107 Lisp_Object
3108 code_convert_string (str, coding, encodep, nocopy)
3109      Lisp_Object str, nocopy;
3110      struct coding_system *coding;
3111      int encodep;
3112 {
3113   int len, consumed, produced;
3114   char *buf;
3115   unsigned char *begp, *endp;
3116   int head_skip, tail_skip;
3117   struct gcpro gcpro1;
3118
3119   if (encodep && !NILP (coding->pre_write_conversion)
3120       || !encodep && !NILP (coding->post_read_conversion))
3121     {
3122       /* Since we have to call Lisp functions which assume target text
3123          is in a buffer, after setting a temporary buffer, call
3124          code_convert_region.  */
3125       int count = specpdl_ptr - specpdl;
3126       int len = XSTRING (str)->size;
3127       Lisp_Object result;
3128       struct buffer *old = current_buffer;
3129
3130       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3131       temp_output_buffer_setup (" *code-converting-work*");
3132       set_buffer_internal (XBUFFER (Vstandard_output));
3133       insert_from_string (str, 0, len, 0);
3134       code_convert_region (make_number (BEGV), make_number (ZV),
3135                            coding, encodep);
3136       result = make_buffer_string (BEGV, ZV, 0);
3137       set_buffer_internal (old);
3138       return unbind_to (count, result);
3139     }
3140
3141   /* We may be able to shrink the conversion region.  */
3142   begp = XSTRING (str)->data;
3143   endp = begp + XSTRING (str)->size;
3144   shrink_conversion_area (&begp, &endp, coding, encodep);
3145
3146   if (begp == endp)
3147     /* We need no conversion.  */
3148     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3149
3150   head_skip = begp - XSTRING (str)->data;
3151   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3152
3153   GCPRO1 (str);
3154
3155   if (encodep)
3156     len = encoding_buffer_size (coding, endp - begp);
3157   else
3158     len = decoding_buffer_size (coding, endp - begp);
3159   buf = get_conversion_buffer (len + head_skip + tail_skip);
3160
3161   bcopy (XSTRING (str)->data, buf, head_skip);
3162   coding->last_block = 1;
3163   produced = (encodep
3164               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3165                                buf + head_skip, endp - begp, len, &consumed)
3166               : decode_coding (coding, XSTRING (str)->data + head_skip,
3167                                buf + head_skip, endp - begp, len, &consumed));
3168   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3169          buf + head_skip + produced,
3170          tail_skip);
3171
3172   UNGCPRO;
3173
3174   return make_string (buf, head_skip + produced + tail_skip);
3175 }
3176
3177 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3178        3, 3, "r\nzCoding system: ",
3179   "Decode current region by specified coding system.\n\
3180 When called from a program, takes three arguments:\n\
3181 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3182 Return length of decoded text.")
3183   (b, e, coding_system)
3184      Lisp_Object b, e, coding_system;
3185 {
3186   struct coding_system coding;
3187
3188   CHECK_NUMBER_COERCE_MARKER (b, 0);
3189   CHECK_NUMBER_COERCE_MARKER (e, 1);
3190   CHECK_SYMBOL (coding_system, 2);
3191
3192   if (NILP (coding_system))
3193     return make_number (XFASTINT (e) - XFASTINT (b));
3194   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3195     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3196
3197   return code_convert_region (b, e, &coding, 0);
3198 }
3199
3200 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3201        3, 3, "r\nzCoding system: ",
3202   "Encode current region by specified coding system.\n\
3203 When called from a program, takes three arguments:\n\
3204 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3205 Return length of encoded text.")
3206   (b, e, coding_system)
3207      Lisp_Object b, e, coding_system;
3208 {
3209   struct coding_system coding;
3210
3211   CHECK_NUMBER_COERCE_MARKER (b, 0);
3212   CHECK_NUMBER_COERCE_MARKER (e, 1);
3213   CHECK_SYMBOL (coding_system, 2);
3214
3215   if (NILP (coding_system))
3216     return make_number (XFASTINT (e) - XFASTINT (b));
3217   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3218     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3219
3220   return code_convert_region (b, e, &coding, 1);
3221 }
3222
3223 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3224        2, 3, 0,
3225   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3226 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3227 of decoding.")
3228   (string, coding_system, nocopy)
3229      Lisp_Object string, coding_system, nocopy;
3230 {
3231   struct coding_system coding;
3232
3233   CHECK_STRING (string, 0);
3234   CHECK_SYMBOL (coding_system, 1);
3235
3236   if (NILP (coding_system))
3237     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3238   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3239     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3240
3241   return code_convert_string (string, &coding, 0, nocopy);
3242 }
3243
3244 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3245        2, 3, 0,
3246   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3247 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3248 of encoding.")
3249   (string, coding_system, nocopy)
3250      Lisp_Object string, coding_system, nocopy;
3251 {
3252   struct coding_system coding;
3253
3254   CHECK_STRING (string, 0);
3255   CHECK_SYMBOL (coding_system, 1);
3256
3257   if (NILP (coding_system))
3258     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3259   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3260     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3261
3262   return code_convert_string (string, &coding, 1, nocopy);
3263 }
3264
3265 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3266   "Decode a JISX0208 character of shift-jis encoding.\n\
3267 CODE is the character code in SJIS.\n\
3268 Return the corresponding character.")
3269   (code)
3270      Lisp_Object code;
3271 {
3272   unsigned char c1, c2, s1, s2;
3273   Lisp_Object val;
3274
3275   CHECK_NUMBER (code, 0);
3276   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3277   DECODE_SJIS (s1, s2, c1, c2);
3278   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3279   return val;
3280 }
3281
3282 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3283   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3284 Return the corresponding character code in SJIS.")
3285   (ch)
3286      Lisp_Object ch;
3287 {
3288   int charset, c1, c2, s1, s2;
3289   Lisp_Object val;
3290
3291   CHECK_NUMBER (ch, 0);
3292   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3293   if (charset == charset_jisx0208)
3294     {
3295       ENCODE_SJIS (c1, c2, s1, s2);
3296       XSETFASTINT (val, (s1 << 8) | s2);
3297     }
3298   else
3299     XSETFASTINT (val, 0);
3300   return val;
3301 }
3302
3303 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3304   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3305 CODE is the character code in BIG5.\n\
3306 Return the corresponding character.")
3307   (code)
3308      Lisp_Object code;
3309 {
3310   int charset;
3311   unsigned char b1, b2, c1, c2;
3312   Lisp_Object val;
3313
3314   CHECK_NUMBER (code, 0);
3315   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3316   DECODE_BIG5 (b1, b2, charset, c1, c2);
3317   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3318   return val;
3319 }
3320
3321 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3322   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3323 Return the corresponding character code in Big5.")
3324   (ch)
3325      Lisp_Object ch;
3326 {
3327   int charset, c1, c2, b1, b2;
3328   Lisp_Object val;
3329
3330   CHECK_NUMBER (ch, 0);
3331   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3332   if (charset == charset_big5_1 || charset == charset_big5_2)
3333     {
3334       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3335       XSETFASTINT (val, (b1 << 8) | b2);
3336     }
3337   else
3338     XSETFASTINT (val, 0);
3339   return val;
3340 }
3341
3342 DEFUN ("set-terminal-coding-system-internal",
3343        Fset_terminal_coding_system_internal,
3344        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3345   (coding_system)
3346      Lisp_Object coding_system;
3347 {
3348   CHECK_SYMBOL (coding_system, 0);
3349   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3350   return Qnil;
3351 }
3352
3353 DEFUN ("terminal-coding-system",
3354        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3355   "Return coding-system of your terminal.")
3356   ()
3357 {
3358   return terminal_coding.symbol;
3359 }
3360
3361 DEFUN ("set-keyboard-coding-system-internal",
3362        Fset_keyboard_coding_system_internal,
3363        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3364   (coding_system)
3365      Lisp_Object coding_system;
3366 {
3367   CHECK_SYMBOL (coding_system, 0);
3368   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3369   return Qnil;
3370 }
3371
3372 DEFUN ("keyboard-coding-system",
3373        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3374   "Return coding-system of what is sent from terminal keyboard.")
3375   ()
3376 {
3377   return keyboard_coding.symbol;
3378 }
3379
3380 \f
3381 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3382        1, MANY, 0,
3383   "Choose a coding system for a file operation based on file name.\n\
3384 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3385 ENCODING-SYSTEM is the coding system to use for encoding\n\
3386 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3387 for decoding (in case OPERATION does decoding).\n\
3388 \n\
3389 The first argument OPERATION specifies an I/O primitive:\n\
3390   For file I/O, `insert-file-contents' or `write-region'.\n\
3391   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3392   For network I/O, `open-network-stream'.\n\
3393 \n\
3394 The remaining arguments should be the same arguments that were passed\n\
3395 to the primitive.  Depending on which primitive, one of those arguments\n\
3396 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3397 whichever argument specifies the file name is TARGET.\n\
3398 \n\
3399 TARGET has a meaning which depends on OPERATION:\n\
3400   For file I/O, TARGET is a file name.\n\
3401   For process I/O, TARGET is a process name.\n\
3402   For network I/O, TARGET is a service name or a port number\n\
3403 \n\
3404 This function looks up what specified for TARGET in,\n\
3405 `file-coding-system-alist', `process-coding-system-alist',\n\
3406 or `network-coding-system-alist' depending on OPERATION.\n\
3407 They may specify a coding system, a cons of coding systems,\n\
3408 or a function symbol to call.\n\
3409 In the last case, we call the function with one argument,\n\
3410 which is a list of all the arguments given to `find-coding-system'.")
3411   (nargs, args)
3412      int nargs;
3413      Lisp_Object *args;
3414 {
3415   Lisp_Object operation, target_idx, target, val;
3416   register Lisp_Object chain;
3417
3418   if (nargs < 2)
3419     error ("Too few arguments");
3420   operation = args[0];
3421   if (!SYMBOLP (operation)
3422       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3423     error ("Invalid first arguement");
3424   if (nargs < 1 + XINT (target_idx))
3425     error ("Too few arguments for operation: %s",
3426            XSYMBOL (operation)->name->data);
3427   target = args[XINT (target_idx) + 1];
3428   if (!(STRINGP (target)
3429         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3430     error ("Invalid %dth argument", XINT (target_idx) + 1);
3431
3432   chain = (operation == Qinsert_file_contents || operation == Qwrite_region
3433            ? Vfile_coding_system_alist
3434            : (operation == Qopen_network_stream
3435               ? Vnetwork_coding_system_alist
3436               : Vprocess_coding_system_alist));
3437   if (NILP (chain))
3438     return Qnil;
3439
3440   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3441     {
3442       Lisp_Object elt = XCONS (chain)->car;
3443
3444       if (CONSP (elt)
3445           && ((STRINGP (target)
3446                && STRINGP (XCONS (elt)->car)
3447                && fast_string_match (XCONS (elt)->car, target) >= 0)
3448               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3449         {
3450           val = XCONS (elt)->cdr;
3451           if (CONSP (val))
3452             return val;
3453           if (! SYMBOLP (val))
3454             return Qnil;
3455           if (! NILP (Fcoding_system_p (val)))
3456             return Fcons (val, val);
3457           if (!NILP (Fboundp (val)))
3458             return call2 (val, Flist (nargs, args));
3459           return Qnil;
3460         }
3461     }
3462   return Qnil;
3463 }
3464
3465 #endif /* emacs */
3466
3467 \f
3468 /*** 8. Post-amble ***/
3469
3470 init_coding_once ()
3471 {
       /* Fill the byte classification tables emacs_code_class and
          iso_code_class, allocate the conversion buffer, and set up
          keyboard_coding and terminal_coding from Qnil.

          Both tables are filled by range loops followed by individual
          stores that override some of the entries just written, so the
          order of the statements below matters.  */
3472   int i;
3473
3474   /* Emacs' internal format specific initialize routine.  */
       /* 0x00..0x20 are control codes, except LF and CR which are
          overridden right after the loop.  */
3475   for (i = 0; i <= 0x20; i++)
3476     emacs_code_class[i] = EMACS_control_code;
3477   emacs_code_class[0x0A] = EMACS_linefeed_code;
3478   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3479   for (i = 0x21 ; i < 0x7F; i++)
3480     emacs_code_class[i] = EMACS_ascii_code;
3481   emacs_code_class[0x7F] = EMACS_control_code;
3482   emacs_code_class[0x80] = EMACS_leading_code_composition;
       /* This loop covers 0x81..0xFE; the four private leading codes
          are then overridden.  Index 0xFF is not assigned anywhere in
          this function -- NOTE(review): confirm that is intended.  */
3483   for (i = 0x81; i < 0xFF; i++)
3484     emacs_code_class[i] = EMACS_invalid_code;
3485   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3486   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3487   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3488   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3489
3490   /* ISO2022 specific initialize routine.  */
       /* The loops skip 0x20, 0x7F, 0xA0 and 0xFF, which get their own
          classes below; the individual control codes that follow
          override the generic ISO_control_code entries.  */
3491   for (i = 0; i < 0x20; i++)
3492     iso_code_class[i] = ISO_control_code;
3493   for (i = 0x21; i < 0x7F; i++)
3494     iso_code_class[i] = ISO_graphic_plane_0;
3495   for (i = 0x80; i < 0xA0; i++)
3496     iso_code_class[i] = ISO_control_code;
3497   for (i = 0xA1; i < 0xFF; i++)
3498     iso_code_class[i] = ISO_graphic_plane_1;
3499   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3500   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3501   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3502   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3503   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3504   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3505   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3506   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3507   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3508   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3509
       /* Start with a conversion buffer of the minimum size.  */
3510   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3511   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3512
       /* Initialize both coding systems from the nil coding system.  */
3513   setup_coding_system (Qnil, &keyboard_coding);
3514   setup_coding_system (Qnil, &terminal_coding);
3515 }
3516
3517 #ifdef emacs
3518
3519 syms_of_coding ()
3520 {
       /* Intern and staticpro the symbols used by this file, register
          its subroutines with defsubr, and define its Lisp variables
          with their initial values.  */

       /* The `target-idx' property of an I/O primitive's symbol is read
          by find-coding-system: it is the index, among the arguments
          following OPERATION, of the TARGET argument.  */
3521   Qtarget_idx = intern ("target-idx");
3522   staticpro (&Qtarget_idx);
3523
       /* Qinsert_file_contents and Qwrite_region are not interned here;
          presumably they are set up elsewhere before this runs --
          TODO confirm.  */
3524   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3525   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3526
3527   Qcall_process = intern ("call-process");
3528   staticpro (&Qcall_process);
3529   Fput (Qcall_process, Qtarget_idx, make_number (0));
3530
3531   Qcall_process_region = intern ("call-process-region");
3532   staticpro (&Qcall_process_region);
3533   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3534
3535   Qstart_process = intern ("start-process");
3536   staticpro (&Qstart_process);
3537   Fput (Qstart_process, Qtarget_idx, make_number (2));
3538
3539   Qopen_network_stream = intern ("open-network-stream");
3540   staticpro (&Qopen_network_stream);
3541   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3542
3543   Qcoding_system = intern ("coding-system");
3544   staticpro (&Qcoding_system);
3545
3546   Qeol_type = intern ("eol-type");
3547   staticpro (&Qeol_type);
3548
3549   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3550   staticpro (&Qbuffer_file_coding_system);
3551
3552   Qpost_read_conversion = intern ("post-read-conversion");
3553   staticpro (&Qpost_read_conversion);
3554
3555   Qpre_write_conversion = intern ("pre-write-conversion");
3556   staticpro (&Qpre_write_conversion);
3557
3558   Qcoding_system_spec = intern ("coding-system-spec");
3559   staticpro (&Qcoding_system_spec);
3560
3561   Qcoding_system_p = intern ("coding-system-p");
3562   staticpro (&Qcoding_system_p);
3563
3564   Qcoding_system_error = intern ("coding-system-error");
3565   staticpro (&Qcoding_system_error);
3566
       /* Give `coding-system-error' its error conditions and message.  */
3567   Fput (Qcoding_system_error, Qerror_conditions,
3568         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3569   Fput (Qcoding_system_error, Qerror_message,
3570         build_string ("Coding-system error"));
3571
3572   Qcoding_category_index = intern ("coding-category-index");
3573   staticpro (&Qcoding_category_index);
3574
       /* Intern each coding category name and record its table index in
          the symbol's `coding-category-index' property.  */
3575   {
3576     int i;
3577     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3578       {
3579         coding_category_table[i] = intern (coding_category_name[i]);
3580         staticpro (&coding_category_table[i]);
3581         Fput (coding_category_table[i], Qcoding_category_index,
3582               make_number (i));
3583       }
3584   }
3585
3586   Qcharacter_unification_table = intern ("character-unification-table");
3587   staticpro (&Qcharacter_unification_table);
3588   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3589         make_number (0));
3590
       /* Register the subroutines defined in this file.  */
3591   defsubr (&Scoding_system_spec);
3592   defsubr (&Scoding_system_p);
3593   defsubr (&Sread_coding_system);
3594   defsubr (&Sread_non_nil_coding_system);
3595   defsubr (&Scheck_coding_system);
3596   defsubr (&Sdetect_coding_region);
3597   defsubr (&Sdecode_coding_region);
3598   defsubr (&Sencode_coding_region);
3599   defsubr (&Sdecode_coding_string);
3600   defsubr (&Sencode_coding_string);
3601   defsubr (&Sdecode_sjis_char);
3602   defsubr (&Sencode_sjis_char);
3603   defsubr (&Sdecode_big5_char);
3604   defsubr (&Sencode_big5_char);
3605   defsubr (&Sset_terminal_coding_system_internal);
3606   defsubr (&Sterminal_coding_system);
3607   defsubr (&Sset_keyboard_coding_system_internal);
3608   defsubr (&Skeyboard_coding_system);
3609   defsubr (&Sfind_coding_system);
3610
3611   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3612     "List of coding-categories (symbols) ordered by priority.");
       /* Build the list back to front so that it ends up in table
          order, index 0 first.  */
3613   {
3614     int i;
3615
3616     Vcoding_category_list = Qnil;
3617     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3618       Vcoding_category_list
3619         = Fcons (coding_category_table[i], Vcoding_category_list);
3620   }
3621
3622   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3623     "A variable of internal use only.\n\
3624 If the value is a coding system, it is used for decoding on read operation.\n\
3625 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3626   Vcoding_system_for_read = Qnil;
3627
3628   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3629     "A variable of internal use only.\n\
3630 If the value is a coding system, it is used for encoding on write operation.\n\
3631 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3632   Vcoding_system_for_write = Qnil;
3633
3634   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3635     "Coding-system used in the latest file or process I/O.");
3636   Vlast_coding_system_used = Qnil;
3637
       /* The three alists below are the ones searched by
          find-coding-system, selected by the kind of I/O operation.  */
3638   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3639     "Alist to decide a coding system to use for a file I/O operation.\n\
3640 The format is ((PATTERN . VAL) ...),\n\
3641 where PATTERN is a regular expression matching a file name,\n\
3642 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3643 If VAL is a coding system, it is used for both decoding and encoding\n\
3644 the file contents.\n\
3645 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3646 and the cdr part is used for encoding.\n\
3647 If VAL is a function symbol, the function must return a coding system\n\
3648 or a cons of coding systems which are used as above.\n\
3649 \n\
3650 See also the function `find-coding-system'.");
3651   Vfile_coding_system_alist = Qnil;
3652
3653   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3654     "Alist to decide a coding system to use for a process I/O operation.\n\
3655 The format is ((PATTERN . VAL) ...),\n\
3656 where PATTERN is a regular expression matching a program name,\n\
3657 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3658 If VAL is a coding system, it is used for both decoding what received\n\
3659 from the program and encoding what sent to the program.\n\
3660 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3661 and the cdr part is used for encoding.\n\
3662 If VAL is a function symbol, the function must return a coding system\n\
3663 or a cons of coding systems which are used as above.\n\
3664 \n\
3665 See also the function `find-coding-system'.");
3666   Vprocess_coding_system_alist = Qnil;
3667
3668   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3669     "Alist to decide a coding system to use for a network I/O operation.\n\
3670 The format is ((PATTERN . VAL) ...),\n\
3671 where PATTERN is a regular expression matching a network service name\n\
3672 or is a port number to connect to,\n\
3673 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3674 If VAL is a coding system, it is used for both decoding what received\n\
3675 from the network stream and encoding what sent to the network stream.\n\
3676 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3677 and the cdr part is used for encoding.\n\
3678 If VAL is a function symbol, the function must return a coding system\n\
3679 or a cons of coding systems which are used as above.\n\
3680 \n\
3681 See also the function `find-coding-system'.");
3682   Vnetwork_coding_system_alist = Qnil;
3683
       /* NOTE(review): the unix and undecided mnemonics are both
          initialized to ':' -- confirm that sharing is intended.  */
3684   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3685     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3686   eol_mnemonic_unix = ':';
3687
3688   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3689     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3690   eol_mnemonic_dos = '\\';
3691
3692   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3693     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3694   eol_mnemonic_mac = '/';
3695
3696   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3697     "Mnemonic character indicating end-of-line format is not yet decided.");
3698   eol_mnemonic_undecided = ':';
3699
3700   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3701     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3702   Venable_character_unification = Qt;
3703
3704   DEFVAR_LISP ("standard-character-unification-table-for-read",
3705     &Vstandard_character_unification_table_for_read,
3706     "Table for unifying characters when reading.");
3707   Vstandard_character_unification_table_for_read = Qnil;
3708
3709   DEFVAR_LISP ("standard-character-unification-table-for-write",
3710     &Vstandard_character_unification_table_for_write,
3711     "Table for unifying characters when writing.");
3712   Vstandard_character_unification_table_for_write = Qnil;
3713
       /* NOTE(review): the Lisp name says "table" while the C variable
          and the doc string say "alist".  */
3714   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3715     "Alist of charsets vs revision numbers.\n\
3716 While encoding, if a charset (car part of an element) is found,\n\
3717 designate it with the escape sequence identifing revision (cdr part of the element).");
3718   Vcharset_revision_alist = Qnil;
3719
3720   DEFVAR_LISP ("default-process-coding-system",
3721                &Vdefault_process_coding_system,
3722     "Cons of coding systems used for process I/O by default.\n\
3723 The car part is used for decoding a process output,\n\
3724 the cdr part is used for encoding a text to be sent to a process.");
3725   Vdefault_process_coding_system = Qnil;
3726 }
3727
3728 #endif /* emacs */