src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-internal), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Raw text
  71
  72   A coding system to for a text containing random 8-bit code.  Emacs
  73   does no code conversion on such a text except for end-of-line
  74   format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding-system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding-system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How end-of-line of a text is encoded depends on a system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  96
  97   Since text characters encoding and end-of-line encoding are
  98   independent, any coding system described above can take
  99   any format of end-of-line.  So, Emacs has information of format of
 100   end-of-line in each coding-system.  See section 6 for more details.
 101
 102 */
 103
 104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 105
 106   These functions check if a text between SRC and SRC_END is encoded
 107   in the coding system category XXX.  Each returns an integer value in
 108   which appropriate flag bits for the category XXX is set.  The flag
 109   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 110   template of these functions.  */
 111 #if 0
 112 int
 113 detect_coding_emacs_mule (src, src_end)
 114      unsigned char *src, *src_end;
 115 {
 116   ...
 117 }
 118 #endif
 119
 120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 121
 122   These functions decode SRC_BYTES length text at SOURCE encoded in
 123   CODING to Emacs' internal format (emacs-mule).  The resulting text
 124   goes to a place pointed to by DESTINATION, the length of which should
 125   not exceed DST_BYTES.  The number of bytes actually processed is
 126   returned as *CONSUMED.  The return value is the length of the decoded
 127   text.  Below is a template of these functions.  */
 128 #if 0
 129 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 130      struct coding_system *coding;
 131      unsigned char *source, *destination;
 132      int src_bytes, dst_bytes;
 133      int *consumed;
 134 {
 135   ...
 136 }
 137 #endif
 138
 139 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 140
 141   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 142   internal format (emacs-mule) to CODING.  The resulting text goes to
 143   a place pointed to by DESTINATION, the length of which should not
 144   exceed DST_BYTES.  The number of bytes actually processed is
 145   returned as *CONSUMED.  The return value is the length of the
 146   encoded text.  Below is a template of these functions.  */
 147 #if 0
 148 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152      int *consumed;
 153 {
 154   ...
 155 }
 156 #endif
 157
 158 /*** COMMONLY USED MACROS ***/
 159
 160 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 161    THREE_MORE_BYTES safely get one, two, and three bytes from the
 162    source text respectively.  If there are not enough bytes in the
 163    source, they jump to `label_end_of_loop'.  The caller should set
 164    variables `src' and `src_end' to appropriate areas in advance.  */
 165
 166 #define ONE_MORE_BYTE(c1)       \
 167   do {                          \
 168     if (src < src_end)          \
 169       c1 = *src++;              \
 170     else                        \
 171       goto label_end_of_loop;   \
 172   } while (0)
 173
 174 #define TWO_MORE_BYTES(c1, c2)  \
 175   do {                          \
 176     if (src + 1 < src_end)      \
 177       c1 = *src++, c2 = *src++; \
 178     else                        \
 179       goto label_end_of_loop;   \
 180   } while (0)
 181
 182 #define THREE_MORE_BYTES(c1, c2, c3)            \
 183   do {                                          \
 184     if (src + 2 < src_end)                      \
 185       c1 = *src++, c2 = *src++, c3 = *src++;    \
 186     else                                        \
 187       goto label_end_of_loop;                   \
 188   } while (0)
 189
 190 /* The following three macros DECODE_CHARACTER_ASCII,
 191    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 192    the multi-byte form of a character of each class at the place
 193    pointed by `dst'.  The caller should set the variable `dst' to
 194    point to an appropriate area and the variable `coding' to point to
 195    the coding-system of the currently decoding text in advance.  */
 196
 197 /* Decode one ASCII character C.  */
 198
 199 #define DECODE_CHARACTER_ASCII(c)                               \
 200   do {                                                          \
 201     if (COMPOSING_P (coding->composing))                        \
 202       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 203     else                                                        \
 204       *dst++ = (c);                                             \
 205   } while (0)
 206
 207 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 208    position-code is C.  */
 209
 210 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 211   do {                                                                  \
 212     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 213     if (COMPOSING_P (coding->composing))                                \
 214       *dst++ = leading_code + 0x20;                                     \
 215     else                                                                \
 216       *dst++ = leading_code;                                            \
 217     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 218       *dst++ = leading_code;                                            \
 219     *dst++ = (c) | 0x80;                                                \
 220   } while (0)
 221
 222 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 223    position-codes are C1 and C2.  */
 224
 225 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 226   do {                                                  \
 227     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 228     *dst++ = (c2) | 0x80;                               \
 229   } while (0)
 230
 231 \f
 232 /*** 1. Preamble ***/
 233
 234 #include <stdio.h>
 235
 236 #ifdef emacs
 237
 238 #include <config.h>
 239 #include "lisp.h"
 240 #include "buffer.h"
 241 #include "charset.h"
 242 #include "ccl.h"
 243 #include "coding.h"
 244 #include "window.h"
 245
 246 #else  /* not emacs */
 247
 248 #include "mulelib.h"
 249
 250 #endif /* not emacs */
 251
 252 Lisp_Object Qcoding_system, Qeol_type;
 253 Lisp_Object Qbuffer_file_coding_system;
 254 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 255 Lisp_Object Qno_conversion, Qundecided;
 256 Lisp_Object Qcoding_system_history;
 257
 258 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 259 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 260 Lisp_Object Qstart_process, Qopen_network_stream;
 261 Lisp_Object Qtarget_idx;
 262
 263 /* Mnemonic character of each format of end-of-line.  */
 264 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 265 /* Mnemonic character to indicate format of end-of-line is not yet
 266    decided.  */
 267 int eol_mnemonic_undecided;
 268
 269 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 270    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 271 int system_eol_type;
 272
 273 #ifdef emacs
 274
 275 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 276
 277 /* Coding system emacs-mule is for converting only end-of-line format.  */
 278 Lisp_Object Qemacs_mule;
 279
 280 /* Coding-systems are handed between Emacs Lisp programs and C internal
 281    routines by the following three variables.  */
 282 /* Coding-system for reading files and receiving data from process.  */
 283 Lisp_Object Vcoding_system_for_read;
 284 /* Coding-system for writing files and sending data to process.  */
 285 Lisp_Object Vcoding_system_for_write;
 286 /* Coding-system actually used in the latest I/O.  */
 287 Lisp_Object Vlast_coding_system_used;
 288
 289 /* A vector of length 256 which contains information about special
 290    Latin codes (espepcially for dealing with Microsoft code).  */
 291 Lisp_Object Vlatin_extra_code_table;
 292
 293 /* Flag to inhibit code conversion of end-of-line format.  */
 294 int inhibit_eol_conversion;
 295
 296 /* Coding system to be used to encode text for terminal display.  */
 297 struct coding_system terminal_coding;
 298
 299 /* Coding system to be used to encode text for terminal display when
 300    terminal coding system is nil.  */
 301 struct coding_system safe_terminal_coding;
 302
 303 /* Coding system of what is sent from terminal keyboard.  */
 304 struct coding_system keyboard_coding;
 305
 306 Lisp_Object Vfile_coding_system_alist;
 307 Lisp_Object Vprocess_coding_system_alist;
 308 Lisp_Object Vnetwork_coding_system_alist;
 309
 310 #endif /* emacs */
 311
 312 Lisp_Object Qcoding_category_index;
 313
 314 /* List of symbols `coding-category-xxx' ordered by priority.  */
 315 Lisp_Object Vcoding_category_list;
 316
 317 /* Table of coding-systems currently assigned to each coding-category.  */
 318 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 319
 320 /* Table of names of symbol for each coding-category.  */
 321 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 322   "coding-category-emacs-mule",
 323   "coding-category-sjis",
 324   "coding-category-iso-7",
 325   "coding-category-iso-8-1",
 326   "coding-category-iso-8-2",
 327   "coding-category-iso-7-else",
 328   "coding-category-iso-8-else",
 329   "coding-category-big5",
 330   "coding-category-raw-text",
 331   "coding-category-binary"
 332 };
 333
 334 /* Flag to tell if we look up unification table on character code
 335    conversion.  */
 336 Lisp_Object Venable_character_unification;
 337 /* Standard unification table to look up on decoding (reading).  */
 338 Lisp_Object Vstandard_character_unification_table_for_decode;
 339 /* Standard unification table to look up on encoding (writing).  */
 340 Lisp_Object Vstandard_character_unification_table_for_encode;
 341
 342 Lisp_Object Qcharacter_unification_table;
 343 Lisp_Object Qcharacter_unification_table_for_decode;
 344 Lisp_Object Qcharacter_unification_table_for_encode;
 345
 346 /* Alist of charsets vs revision number.  */
 347 Lisp_Object Vcharset_revision_alist;
 348
 349 /* Default coding systems used for process I/O.  */
 350 Lisp_Object Vdefault_process_coding_system;
 351
 352 \f
 353 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 354
 355 /* Emacs' internal format for encoding multiple character sets is a
 356    kind of multi-byte encoding, i.e. characters are encoded by
 357    variable-length sequences of one-byte codes.  ASCII characters
 358    and control characters (e.g. `tab', `newline') are represented by
 359    one-byte sequences which are their ASCII codes, in the range 0x00
 360    through 0x7F.  The other characters are represented by a sequence
 361    of `base leading-code', optional `extended leading-code', and one
 362    or two `position-code's.  The length of the sequence is determined
 363    by the base leading-code.  Leading-code takes the range 0x80
 364    through 0x9F, whereas extended leading-code and position-code take
 365    the range 0xA0 through 0xFF.  See `charset.h' for more details
 366    about leading-code and position-code.
 367
 368    There's one exception to this rule.  Special leading-code
 369    `leading-code-composition' denotes that the following several
 370    characters should be composed into one character.  Leading-codes of
 371    components (except for ASCII) are added 0x20.  An ASCII character
 372    component is represented by a 2-byte sequence of `0xA0' and
 373    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 374    details of composite character.  Hence, we can summarize the code
 375    range as follows:
 376
 377    --- CODE RANGE of Emacs' internal format ---
 378    (character set)      (range)
 379    ASCII                0x00 .. 0x7F
 380    ELSE (1st byte)      0x80 .. 0x9F
 381         (rest bytes)    0xA0 .. 0xFF
 382    ---------------------------------------------
 383
 384   */
 385
 386 enum emacs_code_class_type emacs_code_class[256];
 387
 388 /* Go to the next statement only if *SRC is accessible and the code is
 389    greater than 0xA0.  */
 390 #define CHECK_CODE_RANGE_A0_FF  \
 391   do {                          \
 392     if (src >= src_end)         \
 393       goto label_end_of_switch; \
 394     else if (*src++ < 0xA0)     \
 395       return 0;                 \
 396   } while (0)
 397
 398 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 399    Check if a text is encoded in Emacs' internal format.  If it is,
 400    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 401
 402 int
 403 detect_coding_emacs_mule (src, src_end)
 404      unsigned char *src, *src_end;
 405 {
 406   unsigned char c;
 407   int composing = 0;
 408
 409   while (src < src_end)
 410     {
 411       c = *src++;
 412
 413       if (composing)
 414         {
 415           if (c < 0xA0)
 416             composing = 0;
 417           else
 418             c -= 0x20;
 419         }
 420
 421       switch (emacs_code_class[c])
 422         {
 423         case EMACS_ascii_code:
 424         case EMACS_linefeed_code:
 425           break;
 426
 427         case EMACS_control_code:
 428           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 429             return 0;
 430           break;
 431
 432         case EMACS_invalid_code:
 433           return 0;
 434
 435         case EMACS_leading_code_composition: /* c == 0x80 */
 436           if (composing)
 437             CHECK_CODE_RANGE_A0_FF;
 438           else
 439             composing = 1;
 440           break;
 441
 442         case EMACS_leading_code_4:
 443           CHECK_CODE_RANGE_A0_FF;
 444           /* fall down to check it two more times ...  */
 445
 446         case EMACS_leading_code_3:
 447           CHECK_CODE_RANGE_A0_FF;
 448           /* fall down to check it one more time ...  */
 449
 450         case EMACS_leading_code_2:
 451           CHECK_CODE_RANGE_A0_FF;
 452           break;
 453
 454         default:
 455         label_end_of_switch:
 456           break;
 457         }
 458     }
 459   return CODING_CATEGORY_MASK_EMACS_MULE;
 460 }
 461
 462 \f
 463 /*** 3. ISO2022 handlers ***/
 464
 465 /* The following note describes the coding system ISO2022 briefly.
 466    Since the intention of this note is to help in understanding of
 467    the programs in this file, some parts are NOT ACCURATE or OVERLY
 468    SIMPLIFIED.  For the thorough understanding, please refer to the
 469    original document of ISO2022.
 470
 471    ISO2022 provides many mechanisms to encode several character sets
 472    in 7-bit and 8-bit environment.  If one chooses 7-bite environment,
 473    all text is encoded by codes of less than 128.  This may make the
 474    encoded text a little bit longer, but the text gets more stability
 475    to pass through several gateways (some of them strip off the MSB).
 476
 477    There are two kinds of character set: control character set and
 478    graphic character set.  The former contains control characters such
 479    as `newline' and `escape' to provide control functions (control
 480    functions are provided also by escape sequences).  The latter
 481    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 482    two control character sets and many graphic character sets.
 483
 484    Graphic character sets are classified into one of the following
 485    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 486    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 487    bytes (DIMENSION) and the number of characters in one dimension
 488    (CHARS) of the set.  In addition, each character set is assigned an
 489    identification tag (called "final character" and denoted as <F>
 490    here after) which is unique in each class.  <F> of each character
 491    set is decided by ECMA(*) when it is registered in ISO.  Code range
 492    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 493
 494    Note (*): ECMA = European Computer Manufacturers Association
 495
 496    Here are examples of graphic character set [NAME(<F>)]:
 497         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 498         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 499         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 500         o DIMENSION2_CHARS96 -- none for the moment
 501
 502    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 503         C0 [0x00..0x1F] -- control character plane 0
 504         GL [0x20..0x7F] -- graphic character plane 0
 505         C1 [0x80..0x9F] -- control character plane 1
 506         GR [0xA0..0xFF] -- graphic character plane 1
 507
 508    A control character set is directly designated and invoked to C0 or
 509    C1 by an escape sequence.  The most common case is that ISO646's
 510    control character set is designated/invoked to C0 and ISO6429's
 511    control character set is designated/invoked to C1, and usually
 512    these designations/invocations are omitted in a coded text.  With
 513    7-bit environment, only C0 can be used, and a control character for
 514    C1 is encoded by an appropriate escape sequence to fit in the
 515    environment.  All control characters for C1 are defined the
 516    corresponding escape sequences.
 517
 518    A graphic character set is at first designated to one of four
 519    graphic registers (G0 through G3), then these graphic registers are
 520    invoked to GL or GR.  These designations and invocations can be
 521    done independently.  The most common case is that G0 is invoked to
 522    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 523    these invocations and designations are omitted in a coded text.
 524    With 7-bit environment, only GL can be used.
 525
 526    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 527    and 0x7F of GL area work as control characters SPACE and DEL
 528    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 529
 530    There are two ways of invocation: locking-shift and single-shift.
 531    With locking-shift, the invocation lasts until the next different
 532    invocation, whereas with single-shift, the invocation works only
 533    for the following character and doesn't affect locking-shift.
 534    Invocations are done by the following control characters or escape
 535    sequences.
 536
 537    ----------------------------------------------------------------------
 538    function             control char    escape sequence description
 539    ----------------------------------------------------------------------
 540    SI  (shift-in)               0x0F    none            invoke G0 to GL
 541    SO  (shift-out)              0x0E    none            invoke G1 to GL
 542    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 543    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 544    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 545    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 546    ----------------------------------------------------------------------
 547    The first four are for locking-shift.  Control characters for these
 548    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 549
 550    Designations are done by the following escape sequences.
 551    ----------------------------------------------------------------------
 552    escape sequence      description
 553    ----------------------------------------------------------------------
 554    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 555    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 556    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 557    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 558    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 559    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 560    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 561    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 562    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 563    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 564    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 565    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 566    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 567    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 568    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 569    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 570    ----------------------------------------------------------------------
 571
 572    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 573    of dimension 1, chars 94, and final character <F>, and etc.
 574
 575    Note (*): Although these designations are not allowed in ISO2022,
 576    Emacs accepts them on decoding, and produces them on encoding
 577    CHARS96 character set in a coding system which is characterized as
 578    7-bit environment, non-locking-shift, and non-single-shift.
 579
 580    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 581    '(' can be omitted.  We call this as "short-form" here after.
 582
 583    Now you may notice that there are a lot of ways for encoding the
 584    same multilingual text in ISO2022.  Actually, there exists many
 585    coding systems such as Compound Text (used in X's inter client
 586    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 587    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 588    localized platforms), and all of these are variants of ISO2022.
 589
 590    In addition to the above, Emacs handles two more kinds of escape
 591    sequences: ISO6429's direction specification and Emacs' private
 592    sequence for specifying character composition.
 593
 594    ISO6429's direction specification takes the following format:
 595         o CSI ']'      -- end of the current direction
 596         o CSI '0' ']'  -- end of the current direction
 597         o CSI '1' ']'  -- start of left-to-right text
 598         o CSI '2' ']'  -- start of right-to-left text
 599    The control character CSI (0x9B: control sequence introducer) is
 600    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 601
 602    Character composition specification takes the following format:
 603         o ESC '0' -- start character composition
 604         o ESC '1' -- end character composition
 605    Since these are not standard escape sequences of any ISO, the use
 606    of them for these meaning is restricted to Emacs only.  */
 607
 608 enum iso_code_class_type iso_code_class[256];
 609
 610 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 611    Check if a text is encoded in ISO2022.  If it is, returns an
 612    integer in which appropriate flag bits any of:
 613         CODING_CATEGORY_MASK_ISO_7
 614         CODING_CATEGORY_MASK_ISO_8_1
 615         CODING_CATEGORY_MASK_ISO_8_2
 616         CODING_CATEGORY_MASK_ISO_7_ELSE
 617         CODING_CATEGORY_MASK_ISO_8_ELSE
 618    are set.  If a code which should never appear in ISO2022 is found,
 619    returns 0.  */
 620
 621 int
 622 detect_coding_iso2022 (src, src_end)
 623      unsigned char *src, *src_end;
 624 {
 625   int mask = (CODING_CATEGORY_MASK_ISO_7
 626               | CODING_CATEGORY_MASK_ISO_8_1
 627               | CODING_CATEGORY_MASK_ISO_8_2
 628               | CODING_CATEGORY_MASK_ISO_7_ELSE
 629               | CODING_CATEGORY_MASK_ISO_8_ELSE
 630               );
 631   int g1 = 0;                   /* 1 iff designating to G1.  */
 632   int c, i;
 633   struct coding_system coding_iso_8_1, coding_iso_8_2;
 634
 635   /* Coding systems of these categories may accept latin extra codes.  */
 636   setup_coding_system
 637     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
 638      &coding_iso_8_1);
 639   setup_coding_system
 640     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
 641      &coding_iso_8_2);
 642
 643   while (mask && src < src_end)
 644     {
 645       c = *src++;
 646       switch (c)
 647         {
 648         case ISO_CODE_ESC:
 649           if (src >= src_end)
 650             break;
 651           c = *src++;
 652           if ((c >= '(' && c <= '/'))
 653             {
 654               /* Designation sequence for a charset of dimension 1.  */
 655               if (src >= src_end)
 656                 break;
 657               c = *src++;
 658               if (c < ' ' || c >= 0x80)
 659                 /* Invalid designation sequence.  */
 660                 return 0;
 661             }
 662           else if (c == '$')
 663             {
 664               /* Designation sequence for a charset of dimension 2.  */
 665               if (src >= src_end)
 666                 break;
 667               c = *src++;
 668               if (c >= '@' && c <= 'B')
 669                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 670                 ;
 671               else if (c >= '(' && c <= '/')
 672                 {
 673                   if (src >= src_end)
 674                     break;
 675                   c = *src++;
 676                   if (c < ' ' || c >= 0x80)
 677                     /* Invalid designation sequence.  */
 678                     return 0;
 679                 }
 680               else
 681                 /* Invalid designation sequence.  */
 682                 return 0;
 683             }
 684           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 685             /* Locking shift.  */
 686             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 687                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 688           else if (c == '0' || c == '1' || c == '2')
 689             /* Start/end composition.  */
 690             ;
 691           else
 692             /* Invalid escape sequence.  */
 693             return 0;
 694           break;
 695
 696         case ISO_CODE_SO:
 697           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 698                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 699           break;
 700
 701         case ISO_CODE_CSI:
 702         case ISO_CODE_SS2:
 703         case ISO_CODE_SS3:
 704           {
 705             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 706
 707             if (VECTORP (Vlatin_extra_code_table)
 708                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 709               {
 710                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 711                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 712                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 713                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 714               }
 715             mask &= newmask;
 716           }
 717           break;
 718
 719         default:
 720           if (c < 0x80)
 721             break;
 722           else if (c < 0xA0)
 723             {
 724               if (VECTORP (Vlatin_extra_code_table)
 725                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 726                 {
 727                   int newmask = 0;
 728
 729                   if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 730                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 731                   if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 732                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 733                   mask &= newmask;
 734                 }
 735               else
 736                 return 0;
 737             }
 738           else
 739             {
 740               unsigned char *src_begin = src;
 741
 742               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 743                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 744               while (src < src_end && *src >= 0xA0)
 745                 src++;
 746               if ((src - src_begin - 1) & 1 && src < src_end)
 747                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 748             }
 749           break;
 750         }
 751     }
 752
 753   return mask;
 754 }
 755
 756 /* Decode a character of which charset is CHARSET and the 1st position
 757    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 758    fetched from SRC and set to C2.  If CHARSET is negative, it means
 759    that we are decoding ill formed text, and what we can do is just to
 760    read C1 as is.  */
 761
 762 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 763   do {                                                                  \
 764     int c_alt, charset_alt = (charset);                                 \
 765     if (COMPOSING_HEAD_P (coding->composing))                           \
 766       {                                                                 \
 767         *dst++ = LEADING_CODE_COMPOSITION;                              \
 768         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 769           /* To tell composition rules are embeded.  */                 \
 770           *dst++ = 0xFF;                                                \
 771         coding->composing += 2;                                         \
 772       }                                                                 \
 773     if ((charset) >= 0)                                                 \
 774       {                                                                 \
 775         if (CHARSET_DIMENSION (charset) == 2)                           \
 776           ONE_MORE_BYTE (c2);                                           \
 777         if (!NILP (unification_table)                                   \
 778             && ((c_alt = unify_char (unification_table,                 \
 779                                      -1, (charset), c1, c2)) >= 0))     \
 780           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 781       }                                                                 \
 782     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 783       DECODE_CHARACTER_ASCII (c1);                                      \
 784     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 785       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 786     else                                                                \
 787       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 788     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 789       /* To tell a composition rule follows.  */                        \
 790       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 791   } while (0)
 792
 793 /* Set designation state into CODING.  */
 794 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 795   do {                                                                  \
 796     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 797                                      make_number (chars),               \
 798                                      make_number (final_char));         \
 799     if (charset >= 0)                                                   \
 800       {                                                                 \
 801         if (coding->direction == 1                                      \
 802             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 803           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 804         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 805       }                                                                 \
 806   } while (0)
 807
 808 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 809
 810 int
 811 decode_coding_iso2022 (coding, source, destination,
 812                        src_bytes, dst_bytes, consumed)
 813      struct coding_system *coding;
 814      unsigned char *source, *destination;
 815      int src_bytes, dst_bytes;
 816      int *consumed;
 817 {
 818   unsigned char *src = source;
 819   unsigned char *src_end = source + src_bytes;
 820   unsigned char *dst = destination;
 821   unsigned char *dst_end = destination + dst_bytes;
 822   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 823      from DST_END to assure that overflow checking is necessary only
 824      at the head of loop.  */
 825   unsigned char *adjusted_dst_end = dst_end - 6;
 826   int charset;
 827   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 828   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 829   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 830   Lisp_Object unification_table
 831       = coding->character_unification_table_for_decode;
 832
 833   if (!NILP (Venable_character_unification) && NILP (unification_table))
 834     unification_table = Vstandard_character_unification_table_for_decode;
 835
 836   while (src < src_end && dst < adjusted_dst_end)
 837     {
 838       /* SRC_BASE remembers the start position in source in each loop.
 839          The loop will be exited when there's not enough source text
 840          to analyze long escape sequence or 2-byte code (within macros
 841          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 842          to SRC_BASE before exiting.  */
 843       unsigned char *src_base = src;
 844       int c1 = *src++, c2;
 845
 846       switch (iso_code_class [c1])
 847         {
 848         case ISO_0x20_or_0x7F:
 849           if (!coding->composing
 850               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 851             {
 852               /* This is SPACE or DEL.  */
 853               *dst++ = c1;
 854               break;
 855             }
 856           /* This is a graphic character, we fall down ...  */
 857
 858         case ISO_graphic_plane_0:
 859           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 860             {
 861               /* This is a composition rule.  */
 862               *dst++ = c1 | 0x80;
 863               coding->composing = COMPOSING_WITH_RULE_TAIL;
 864             }
 865           else
 866             DECODE_ISO_CHARACTER (charset0, c1);
 867           break;
 868
 869         case ISO_0xA0_or_0xFF:
 870           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 871             {
 872               /* Invalid code.  */
 873               *dst++ = c1;
 874               break;
 875             }
 876           /* This is a graphic character, we fall down ... */
 877
 878         case ISO_graphic_plane_1:
 879           DECODE_ISO_CHARACTER (charset1, c1);
 880           break;
 881
 882         case ISO_control_code:
 883           /* All ISO2022 control characters in this class have the
 884              same representation in Emacs internal format.  */
 885           *dst++ = c1;
 886           break;
 887
 888         case ISO_carriage_return:
 889           if (coding->eol_type == CODING_EOL_CR)
 890             {
 891               *dst++ = '\n';
 892             }
 893           else if (coding->eol_type == CODING_EOL_CRLF)
 894             {
 895               ONE_MORE_BYTE (c1);
 896               if (c1 == ISO_CODE_LF)
 897                 *dst++ = '\n';
 898               else
 899                 {
 900                   src--;
 901                   *dst++ = c1;
 902                 }
 903             }
 904           else
 905             {
 906               *dst++ = c1;
 907             }
 908           break;
 909
 910         case ISO_shift_out:
 911           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 912             goto label_invalid_escape_sequence;
 913           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 914           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 915           break;
 916
 917         case ISO_shift_in:
 918           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 919           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 920           break;
 921
 922         case ISO_single_shift_2_7:
 923         case ISO_single_shift_2:
 924           /* SS2 is handled as an escape sequence of ESC 'N' */
 925           c1 = 'N';
 926           goto label_escape_sequence;
 927
 928         case ISO_single_shift_3:
 929           /* SS2 is handled as an escape sequence of ESC 'O' */
 930           c1 = 'O';
 931           goto label_escape_sequence;
 932
 933         case ISO_control_sequence_introducer:
 934           /* CSI is handled as an escape sequence of ESC '[' ...  */
 935           c1 = '[';
 936           goto label_escape_sequence;
 937
 938         case ISO_escape:
 939           ONE_MORE_BYTE (c1);
 940         label_escape_sequence:
 941           /* Escape sequences handled by Emacs are invocation,
 942              designation, direction specification, and character
 943              composition specification.  */
 944           switch (c1)
 945             {
 946             case '&':           /* revision of following character set */
 947               ONE_MORE_BYTE (c1);
 948               if (!(c1 >= '@' && c1 <= '~'))
 949                 goto label_invalid_escape_sequence;
 950               ONE_MORE_BYTE (c1);
 951               if (c1 != ISO_CODE_ESC)
 952                 goto label_invalid_escape_sequence;
 953               ONE_MORE_BYTE (c1);
 954               goto label_escape_sequence;
 955
 956             case '$':           /* designation of 2-byte character set */
 957               ONE_MORE_BYTE (c1);
 958               if (c1 >= '@' && c1 <= 'B')
 959                 {       /* designation of JISX0208.1978, GB2312.1980,
 960                                    or JISX0208.1980 */
 961                   DECODE_DESIGNATION (0, 2, 94, c1);
 962                 }
 963               else if (c1 >= 0x28 && c1 <= 0x2B)
 964                 {       /* designation of DIMENSION2_CHARS94 character set */
 965                   ONE_MORE_BYTE (c2);
 966                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 967                 }
 968               else if (c1 >= 0x2C && c1 <= 0x2F)
 969                 {       /* designation of DIMENSION2_CHARS96 character set */
 970                   ONE_MORE_BYTE (c2);
 971                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 972                 }
 973               else
 974                 goto label_invalid_escape_sequence;
 975               break;
 976
 977             case 'n':           /* invocation of locking-shift-2 */
 978               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 979                 goto label_invalid_escape_sequence;
 980               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 981               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 982               break;
 983
 984             case 'o':           /* invocation of locking-shift-3 */
 985               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 986                 goto label_invalid_escape_sequence;
 987               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 988               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 989               break;
 990
 991             case 'N':           /* invocation of single-shift-2 */
 992               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 993                 goto label_invalid_escape_sequence;
 994               ONE_MORE_BYTE (c1);
 995               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 996               DECODE_ISO_CHARACTER (charset, c1);
 997               break;
 998
 999             case 'O':           /* invocation of single-shift-3 */
1000               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1001                 goto label_invalid_escape_sequence;
1002               ONE_MORE_BYTE (c1);
1003               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1004               DECODE_ISO_CHARACTER (charset, c1);
1005               break;
1006
1007             case '0':           /* start composing without embeded rules */
1008               coding->composing = COMPOSING_NO_RULE_HEAD;
1009               break;
1010
1011             case '1':           /* end composing */
1012               coding->composing = COMPOSING_NO;
1013               break;
1014
1015             case '2':           /* start composing with embeded rules */
1016               coding->composing = COMPOSING_WITH_RULE_HEAD;
1017               break;
1018
1019             case '[':           /* specification of direction */
1020               /* For the moment, nested direction is not supported.
1021                  So, the value of `coding->direction' is 0 or 1: 0
1022                  means left-to-right, 1 means right-to-left.  */
1023               ONE_MORE_BYTE (c1);
1024               switch (c1)
1025                 {
1026                 case ']':       /* end of the current direction */
1027                   coding->direction = 0;
1028
1029                 case '0':       /* end of the current direction */
1030                 case '1':       /* start of left-to-right direction */
1031                   ONE_MORE_BYTE (c1);
1032                   if (c1 == ']')
1033                     coding->direction = 0;
1034                   else
1035                     goto label_invalid_escape_sequence;
1036                   break;
1037
1038                 case '2':       /* start of right-to-left direction */
1039                   ONE_MORE_BYTE (c1);
1040                   if (c1 == ']')
1041                     coding->direction= 1;
1042                   else
1043                     goto label_invalid_escape_sequence;
1044                   break;
1045
1046                 default:
1047                   goto label_invalid_escape_sequence;
1048                 }
1049               break;
1050
1051             default:
1052               if (c1 >= 0x28 && c1 <= 0x2B)
1053                 {       /* designation of DIMENSION1_CHARS94 character set */
1054                   ONE_MORE_BYTE (c2);
1055                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1056                 }
1057               else if (c1 >= 0x2C && c1 <= 0x2F)
1058                 {       /* designation of DIMENSION1_CHARS96 character set */
1059                   ONE_MORE_BYTE (c2);
1060                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1061                 }
1062               else
1063                 {
1064                   goto label_invalid_escape_sequence;
1065                 }
1066             }
1067           /* We must update these variables now.  */
1068           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1069           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1070           break;
1071
1072         label_invalid_escape_sequence:
1073           {
1074             int length = src - src_base;
1075
1076             bcopy (src_base, dst, length);
1077             dst += length;
1078           }
1079         }
1080       continue;
1081
1082     label_end_of_loop:
1083       coding->carryover_size = src - src_base;
1084       bcopy (src_base, coding->carryover, coding->carryover_size);
1085       src = src_base;
1086       break;
1087     }
1088
1089   /* If this is the last block of the text to be decoded, we had
1090      better just flush out all remaining codes in the text although
1091      they are not valid characters.  */
1092   if (coding->last_block)
1093     {
1094       bcopy (src, dst, src_end - src);
1095       dst += (src_end - src);
1096       src = src_end;
1097     }
1098   *consumed = src - source;
1099   return dst - destination;
1100 }
1101
1102 /* ISO2022 encoding stuff.  */
1103
1104 /*
1105    It is not enough to say just "ISO2022" on encoding, we have to
1106    specify more details.  In Emacs, each coding-system of ISO2022
1107    variant has the following specifications:
1108         1. Initial designation to G0 thru G3.
1109         2. Allows short-form designation?
1110         3. ASCII should be designated to G0 before control characters?
1111         4. ASCII should be designated to G0 at end of line?
1112         5. 7-bit environment or 8-bit environment?
1113         6. Use locking-shift?
1114         7. Use Single-shift?
1115    And the following two are only for Japanese:
1116         8. Use ASCII in place of JIS0201-1976-Roman?
1117         9. Use JISX0208-1983 in place of JISX0208-1978?
1118    These specifications are encoded in `coding->flags' as flag bits
1119    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1120    details.
1121 */
1122
1123 /* Produce codes (escape sequence) for designating CHARSET to graphic
1124    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1125    the coding system CODING allows, produce designation sequence of
1126    short-form.  */
1127
1128 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1129   do {                                                                  \
1130     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1131     char *intermediate_char_94 = "()*+";                                \
1132     char *intermediate_char_96 = ",-./";                                \
1133     Lisp_Object temp                                                    \
1134       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1135     if (! NILP (temp))                                                  \
1136         {                                                               \
1137         *dst++ = ISO_CODE_ESC;                                          \
1138         *dst++ = '&';                                                   \
1139         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1140       }                                                                 \
1141     *dst++ = ISO_CODE_ESC;                                              \
1142     if (CHARSET_DIMENSION (charset) == 1)                               \
1143       {                                                                 \
1144         if (CHARSET_CHARS (charset) == 94)                              \
1145           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1146         else                                                            \
1147           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1148       }                                                                 \
1149     else                                                                \
1150       {                                                                 \
1151         *dst++ = '$';                                                   \
1152         if (CHARSET_CHARS (charset) == 94)                              \
1153           {                                                             \
1154             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1155                 || reg != 0                                             \
1156                 || final_char < '@' || final_char > 'B')                \
1157               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1158           }                                                             \
1159         else                                                            \
1160           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1161       }                                                                 \
1162     *dst++ = final_char;                                                \
1163     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1164   } while (0)
1165
1166 /* The following two macros produce codes (control character or escape
1167    sequence) for ISO2022 single-shift functions (single-shift-2 and
1168    single-shift-3).  */
1169
1170 #define ENCODE_SINGLE_SHIFT_2                           \
1171   do {                                                  \
1172     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1173       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1174     else                                                \
1175       *dst++ = ISO_CODE_SS2;                            \
1176     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1177   } while (0)
1178
1179 #define ENCODE_SINGLE_SHIFT_3                           \
1180   do {                                                  \
1181     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1182       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1183     else                                                \
1184       *dst++ = ISO_CODE_SS3;                            \
1185     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1186   } while (0)
1187
1188 /* The following four macros produce codes (control character or
1189    escape sequence) for ISO2022 locking-shift functions (shift-in,
1190    shift-out, locking-shift-2, and locking-shift-3).  */
1191
1192 #define ENCODE_SHIFT_IN                         \
1193   do {                                          \
1194     *dst++ = ISO_CODE_SI;                       \
1195     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1196   } while (0)
1197
1198 #define ENCODE_SHIFT_OUT                        \
1199   do {                                          \
1200     *dst++ = ISO_CODE_SO;                       \
1201     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1202   } while (0)
1203
1204 #define ENCODE_LOCKING_SHIFT_2                  \
1205   do {                                          \
1206     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1207     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1208   } while (0)
1209
1210 #define ENCODE_LOCKING_SHIFT_3                  \
1211   do {                                          \
1212     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1213     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1214   } while (0)
1215
1216 /* Produce codes for a DIMENSION1 character whose character set is
1217    CHARSET and whose position-code is C1.  Designation and invocation
1218    sequences are also produced in advance if necessary.  */
1219
1220
1221 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1222   do {                                                                  \
1223     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1224       {                                                                 \
1225         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1226           *dst++ = c1 & 0x7F;                                           \
1227         else                                                            \
1228           *dst++ = c1 | 0x80;                                           \
1229         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1230         break;                                                          \
1231       }                                                                 \
1232     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1233       {                                                                 \
1234         *dst++ = c1 & 0x7F;                                             \
1235         break;                                                          \
1236       }                                                                 \
1237     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1238       {                                                                 \
1239         *dst++ = c1 | 0x80;                                             \
1240         break;                                                          \
1241       }                                                                 \
1242     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1243              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1244       {                                                                 \
1245         /* We should not encode this character, instead produce one or  \
1246            two `?'s.  */                                                \
1247         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1248         if (CHARSET_WIDTH (charset) == 2)                               \
1249           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1250         break;                                                          \
1251       }                                                                 \
1252     else                                                                \
1253       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1254          must invoke it, or, at first, designate it to some graphic     \
1255          register.  Then repeat the loop to actually produce the        \
1256          character.  */                                                 \
1257       dst = encode_invocation_designation (charset, coding, dst);       \
1258   } while (1)
1259
1260 /* Produce codes for a DIMENSION2 character whose character set is
1261    CHARSET and whose position-codes are C1 and C2.  Designation and
1262    invocation codes are also produced in advance if necessary.  */
1263
1264 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1265   do {                                                                  \
1266     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1267       {                                                                 \
1268         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1269           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1270         else                                                            \
1271           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1272         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1273         break;                                                          \
1274       }                                                                 \
1275     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1276       {                                                                 \
1277         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1278         break;                                                          \
1279       }                                                                 \
1280     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1281       {                                                                 \
1282         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1283         break;                                                          \
1284       }                                                                 \
1285     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1286              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1287       {                                                                 \
1288         /* We should not encode this character, instead produce one or  \
1289            two `?'s.  */                                                \
1290         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1291         if (CHARSET_WIDTH (charset) == 2)                               \
1292           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1293         break;                                                          \
1294       }                                                                 \
1295     else                                                                \
1296       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1297          must invoke it, or, at first, designate it to some graphic     \
1298          register.  Then repeat the loop to actually produce the        \
1299          character.  */                                                 \
1300       dst = encode_invocation_designation (charset, coding, dst);       \
1301   } while (1)
1302
1303 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1304   do {                                                                    \
1305     int c_alt, charset_alt;                                               \
1306     if (!NILP (unification_table)                                         \
1307         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1308             >= 0))                                                        \
1309       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1310     else                                                                  \
1311       charset_alt = charset;                                              \
1312     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1313       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1314     else                                                                  \
1315       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1316   } while (0)
1317
1318 /* Produce designation and invocation codes at a place pointed by DST
1319    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1320    Return new DST.  */
1321
1322 unsigned char *
1323 encode_invocation_designation (charset, coding, dst)
1324      int charset;
1325      struct coding_system *coding;
1326      unsigned char *dst;
1327 {
1328   int reg;                      /* graphic register number */
1329
1330   /* At first, check designations.  */
1331   for (reg = 0; reg < 4; reg++)
1332     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1333       break;
1334
1335   if (reg >= 4)
1336     {
1337       /* CHARSET is not yet designated to any graphic registers.  */
1338       /* At first check the requested designation.  */
1339       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1340       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1341         /* Since CHARSET requests no special designation, designate it
1342            to graphic register 0.  */
1343         reg = 0;
1344
1345       ENCODE_DESIGNATION (charset, reg, coding);
1346     }
1347
1348   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1349       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1350     {
1351       /* Since the graphic register REG is not invoked to any graphic
1352          planes, invoke it to graphic plane 0.  */
1353       switch (reg)
1354         {
1355         case 0:                 /* graphic register 0 */
1356           ENCODE_SHIFT_IN;
1357           break;
1358
1359         case 1:                 /* graphic register 1 */
1360           ENCODE_SHIFT_OUT;
1361           break;
1362
1363         case 2:                 /* graphic register 2 */
1364           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1365             ENCODE_SINGLE_SHIFT_2;
1366           else
1367             ENCODE_LOCKING_SHIFT_2;
1368           break;
1369
1370         case 3:                 /* graphic register 3 */
1371           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1372             ENCODE_SINGLE_SHIFT_3;
1373           else
1374             ENCODE_LOCKING_SHIFT_3;
1375           break;
1376         }
1377     }
1378   return dst;
1379 }
1380
1381 /* The following two macros produce codes for indicating composition.  */
1382 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1383 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1384 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1385
1386 /* The following three macros produce codes for indicating direction
1387    of text.  */
1388 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1389   do {                                                  \
1390     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1391       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1392     else                                                \
1393       *dst++ = ISO_CODE_CSI;                            \
1394   } while (0)
1395
1396 #define ENCODE_DIRECTION_R2L    \
1397   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1398
1399 #define ENCODE_DIRECTION_L2R    \
1400   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1401
1402 /* Produce codes for designation and invocation to reset the graphic
1403    planes and registers to initial state.  */
1404 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1405   do {                                                                      \
1406     int reg;                                                                \
1407     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1408       ENCODE_SHIFT_IN;                                                      \
1409     for (reg = 0; reg < 4; reg++)                                           \
1410       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1411           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1412               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1413         ENCODE_DESIGNATION                                                  \
1414           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1415   } while (0)
1416
1417 /* Produce designation sequences of charsets in the line started from
1418    *SRC to a place pointed by DSTP.
1419
1420    If the current block ends before any end-of-line, we may fail to
1421    find all the necessary *designations.  */
1422 encode_designation_at_bol (coding, table, src, src_end, dstp)
1423      struct coding_system *coding;
1424      Lisp_Object table;
1425      unsigned char *src, *src_end, **dstp;
1426 {
1427   int charset, c, found = 0, reg;
1428   /* Table of charsets to be designated to each graphic register.  */
1429   int r[4];
1430   unsigned char *dst = *dstp;
1431
1432   for (reg = 0; reg < 4; reg++)
1433     r[reg] = -1;
1434
1435   while (src < src_end && *src != '\n' && found < 4)
1436     {
1437       int bytes = BYTES_BY_CHAR_HEAD (*src);
1438
1439       if (NILP (table))
1440         charset = CHARSET_AT (src);
1441       else
1442         {
1443           int c_alt;
1444           unsigned char c1, c2;
1445
1446           SPLIT_STRING(src, bytes, charset, c1, c2);
1447           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1448             charset = CHAR_CHARSET (c_alt);
1449         }
1450
1451       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1452       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1453         {
1454           found++;
1455           r[reg] = charset;
1456         }
1457
1458       src += bytes;
1459     }
1460
1461   if (found)
1462     {
1463       for (reg = 0; reg < 4; reg++)
1464         if (r[reg] >= 0
1465             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1466           ENCODE_DESIGNATION (r[reg], reg, coding);
1467       *dstp = dst;
1468     }
1469 }
1470
1471 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1472
1473 int
1474 encode_coding_iso2022 (coding, source, destination,
1475                        src_bytes, dst_bytes, consumed)
1476      struct coding_system *coding;
1477      unsigned char *source, *destination;
1478      int src_bytes, dst_bytes;
1479      int *consumed;
1480 {
1481   unsigned char *src = source;
1482   unsigned char *src_end = source + src_bytes;
1483   unsigned char *dst = destination;
1484   unsigned char *dst_end = destination + dst_bytes;
1485   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1486      from DST_END to assure overflow checking is necessary only at the
1487      head of loop.  */
1488   unsigned char *adjusted_dst_end = dst_end - 19;
1489   Lisp_Object unification_table
1490       = coding->character_unification_table_for_encode;
1491
1492   if (!NILP (Venable_character_unification) && NILP (unification_table))
1493     unification_table = Vstandard_character_unification_table_for_encode;
1494
1495   while (src < src_end && dst < adjusted_dst_end)
1496     {
1497       /* SRC_BASE remembers the start position in source in each loop.
1498          The loop will be exited when there's not enough source text
1499          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1500          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1501          reset to SRC_BASE before exiting.  */
1502       unsigned char *src_base = src;
1503       int charset, c1, c2, c3, c4;
1504
1505       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1506           && CODING_SPEC_ISO_BOL (coding))
1507         {
1508           /* We have to produce designation sequences if any now.  */
1509           encode_designation_at_bol (coding, unification_table,
1510                                      src, src_end, &dst);
1511           CODING_SPEC_ISO_BOL (coding) = 0;
1512         }
1513
1514       c1 = *src++;
1515       /* If we are seeing a component of a composite character, we are
1516          seeing a leading-code specially encoded for composition, or a
1517          composition rule if composing with rule.  We must set C1
1518          to a normal leading-code or an ASCII code.  If we are not at
1519          a composed character, we must reset the composition state.  */
1520       if (COMPOSING_P (coding->composing))
1521         {
1522           if (c1 < 0xA0)
1523             {
1524               /* We are not in a composite character any longer.  */
1525               coding->composing = COMPOSING_NO;
1526               ENCODE_COMPOSITION_END;
1527             }
1528           else
1529             {
1530               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1531                 {
1532                   *dst++ = c1 & 0x7F;
1533                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1534                   continue;
1535                 }
1536               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1537                 coding->composing = COMPOSING_WITH_RULE_RULE;
1538               if (c1 == 0xA0)
1539                 {
1540                   /* This is an ASCII component.  */
1541                   ONE_MORE_BYTE (c1);
1542                   c1 &= 0x7F;
1543                 }
1544               else
1545                 /* This is a leading-code of non ASCII component.  */
1546                 c1 -= 0x20;
1547             }
1548         }
1549
1550       /* Now encode one character.  C1 is a control character, an
1551          ASCII character, or a leading-code of multi-byte character.  */
1552       switch (emacs_code_class[c1])
1553         {
1554         case EMACS_ascii_code:
1555           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1556           break;
1557
1558         case EMACS_control_code:
1559           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1560             ENCODE_RESET_PLANE_AND_REGISTER;
1561           *dst++ = c1;
1562           break;
1563
1564         case EMACS_carriage_return_code:
1565           if (!coding->selective)
1566             {
1567               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1568                 ENCODE_RESET_PLANE_AND_REGISTER;
1569               *dst++ = c1;
1570               break;
1571             }
1572           /* fall down to treat '\r' as '\n' ...  */
1573
1574         case EMACS_linefeed_code:
1575           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1576             ENCODE_RESET_PLANE_AND_REGISTER;
1577           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1578             bcopy (coding->spec.iso2022.initial_designation,
1579                    coding->spec.iso2022.current_designation,
1580                    sizeof coding->spec.iso2022.initial_designation);
1581           if (coding->eol_type == CODING_EOL_LF
1582               || coding->eol_type == CODING_EOL_UNDECIDED)
1583             *dst++ = ISO_CODE_LF;
1584           else if (coding->eol_type == CODING_EOL_CRLF)
1585             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1586           else
1587             *dst++ = ISO_CODE_CR;
1588           CODING_SPEC_ISO_BOL (coding) = 1;
1589           break;
1590
1591         case EMACS_leading_code_2:
1592           ONE_MORE_BYTE (c2);
1593           if (c2 < 0xA0)
1594             {
1595               /* invalid sequence */
1596               *dst++ = c1;
1597               *dst++ = c2;
1598             }
1599           else
1600             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1601           break;
1602
1603         case EMACS_leading_code_3:
1604           TWO_MORE_BYTES (c2, c3);
1605           if (c2 < 0xA0 || c3 < 0xA0)
1606             {
1607               /* invalid sequence */
1608               *dst++ = c1;
1609               *dst++ = c2;
1610               *dst++ = c3;
1611             }
1612           else if (c1 < LEADING_CODE_PRIVATE_11)
1613             ENCODE_ISO_CHARACTER (c1, c2, c3);
1614           else
1615             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1616           break;
1617
1618         case EMACS_leading_code_4:
1619           THREE_MORE_BYTES (c2, c3, c4);
1620           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1621             {
1622               /* invalid sequence */
1623               *dst++ = c1;
1624               *dst++ = c2;
1625               *dst++ = c3;
1626               *dst++ = c4;
1627             }
1628           else
1629             ENCODE_ISO_CHARACTER (c2, c3, c4);
1630           break;
1631
1632         case EMACS_leading_code_composition:
1633           ONE_MORE_BYTE (c2);
1634           if (c2 < 0xA0)
1635             {
1636               /* invalid sequence */
1637               *dst++ = c1;
1638               *dst++ = c2;
1639             }
1640           else if (c2 == 0xFF)
1641             {
1642               coding->composing = COMPOSING_WITH_RULE_HEAD;
1643               ENCODE_COMPOSITION_WITH_RULE_START;
1644             }
1645           else
1646             {
1647               /* Rewind one byte because it is a character code of
1648                  composition elements.  */
1649               src--;
1650               coding->composing = COMPOSING_NO_RULE_HEAD;
1651               ENCODE_COMPOSITION_NO_RULE_START;
1652             }
1653           break;
1654
1655         case EMACS_invalid_code:
1656           *dst++ = c1;
1657           break;
1658         }
1659       continue;
1660     label_end_of_loop:
1661       /* We reach here because the source date ends not at character
1662          boundary.  */
1663       coding->carryover_size = src_end - src_base;
1664       bcopy (src_base, coding->carryover, coding->carryover_size);
1665       src = src_end;
1666       break;
1667     }
1668
1669   /* If this is the last block of the text to be encoded, we must
1670      reset graphic planes and registers to the initial state.  */
1671   if (src >= src_end && coding->last_block)
1672     {
1673       ENCODE_RESET_PLANE_AND_REGISTER;
1674       if (coding->carryover_size > 0
1675           && coding->carryover_size < (dst_end - dst))
1676         {
1677           bcopy (coding->carryover, dst, coding->carryover_size);
1678           dst += coding->carryover_size;
1679           coding->carryover_size = 0;
1680         }
1681     }
1682   *consumed = src - source;
1683   return dst - destination;
1684 }
1685
1686 \f
1687 /*** 4. SJIS and BIG5 handlers ***/
1688
1689 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1690    quite widely.  So, for the moment, Emacs supports them in the bare
1691    C code.  But, in the future, they may be supported only by CCL.  */
1692
1693 /* SJIS is a coding system encoding three character sets: ASCII, right
1694    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1695    as is.  A character of charset katakana-jisx0201 is encoded by
1696    "position-code + 0x80".  A character of charset japanese-jisx0208
1697    is encoded in 2-byte but two position-codes are divided and shifted
1698    so that it fit in the range below.
1699
1700    --- CODE RANGE of SJIS ---
1701    (character set)      (range)
1702    ASCII                0x00 .. 0x7F
1703    KATAKANA-JISX0201    0xA0 .. 0xDF
1704    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1705             (2nd byte)  0x40 .. 0xFF
1706    -------------------------------
1707
1708 */
1709
1710 /* BIG5 is a coding system encoding two character sets: ASCII and
1711    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1712    character set and is encoded in two-byte.
1713
1714    --- CODE RANGE of BIG5 ---
1715    (character set)      (range)
1716    ASCII                0x00 .. 0x7F
1717    Big5 (1st byte)      0xA1 .. 0xFE
1718         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1719    --------------------------
1720
1721    Since the number of characters in Big5 is larger than maximum
1722    characters in Emacs' charset (96x96), it can't be handled as one
1723    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1724    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1725    contains frequently used characters and the latter contains less
1726    frequently used characters.  */
1727
1728 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1729    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1730    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1731    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1732
1733 /* Number of Big5 characters which have the same code in 1st byte.  */
1734 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1735
1736 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1737   do {                                                                  \
1738     unsigned int temp                                                   \
1739       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1740     if (b1 < 0xC9)                                                      \
1741       charset = charset_big5_1;                                         \
1742     else                                                                \
1743       {                                                                 \
1744         charset = charset_big5_2;                                       \
1745         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1746       }                                                                 \
1747     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1748     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1749   } while (0)
1750
1751 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1752   do {                                                                  \
1753     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1754     if (charset == charset_big5_2)                                      \
1755       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1756     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1757     b2 = temp % BIG5_SAME_ROW;                                          \
1758     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1759   } while (0)
1760
1761 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1762   do {                                                                  \
1763     int c_alt, charset_alt = (charset);                                 \
1764     if (!NILP (unification_table)                                       \
1765         && ((c_alt = unify_char (unification_table,                     \
1766                                  -1, (charset), c1, c2)) >= 0))         \
1767           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
1768     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
1769       DECODE_CHARACTER_ASCII (c1);                                      \
1770     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
1771       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1772     else                                                                \
1773       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1774   } while (0)
1775
1776 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1777   do {                                                                    \
1778     int c_alt, charset_alt;                                               \
1779     if (!NILP (unification_table)                                         \
1780         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1781             >= 0))                                                        \
1782       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1783     else                                                                  \
1784       charset_alt = charset;                                              \
1785     if (charset_alt == charset_ascii)                                     \
1786       *dst++ = c1;                                                        \
1787     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1788       {                                                                   \
1789         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1790           *dst++ = c1;                                                    \
1791         else                                                              \
1792           *dst++ = charset_alt, *dst++ = c1;                              \
1793       }                                                                   \
1794     else                                                                  \
1795       {                                                                   \
1796         c1 &= 0x7F, c2 &= 0x7F;                                           \
1797         if (sjis_p && charset_alt == charset_jisx0208)                    \
1798           {                                                               \
1799             unsigned char s1, s2;                                         \
1800                                                                           \
1801             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1802             *dst++ = s1, *dst++ = s2;                                     \
1803           }                                                               \
1804         else if (!sjis_p                                                  \
1805                  && (charset_alt == charset_big5_1                        \
1806                      || charset_alt == charset_big5_2))                   \
1807           {                                                               \
1808             unsigned char b1, b2;                                         \
1809                                                                           \
1810             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1811             *dst++ = b1, *dst++ = b2;                                     \
1812           }                                                               \
1813         else                                                              \
1814           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1815       }                                                                   \
1816   } while (0);
1817
1818 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1819    Check if a text is encoded in SJIS.  If it is, return
1820    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1821
1822 int
1823 detect_coding_sjis (src, src_end)
1824      unsigned char *src, *src_end;
1825 {
1826   unsigned char c;
1827
1828   while (src < src_end)
1829     {
1830       c = *src++;
1831       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1832         return 0;
1833       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1834         {
1835           if (src < src_end && *src++ < 0x40)
1836             return 0;
1837         }
1838     }
1839   return CODING_CATEGORY_MASK_SJIS;
1840 }
1841
1842 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1843    Check if a text is encoded in BIG5.  If it is, return
1844    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1845
1846 int
1847 detect_coding_big5 (src, src_end)
1848      unsigned char *src, *src_end;
1849 {
1850   unsigned char c;
1851
1852   while (src < src_end)
1853     {
1854       c = *src++;
1855       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1856         return 0;
1857       if (c >= 0xA1)
1858         {
1859           if (src >= src_end)
1860             break;
1861           c = *src++;
1862           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1863             return 0;
1864         }
1865     }
1866   return CODING_CATEGORY_MASK_BIG5;
1867 }
1868
1869 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1870    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1871
1872 int
1873 decode_coding_sjis_big5 (coding, source, destination,
1874                          src_bytes, dst_bytes, consumed, sjis_p)
1875      struct coding_system *coding;
1876      unsigned char *source, *destination;
1877      int src_bytes, dst_bytes;
1878      int *consumed;
1879      int sjis_p;
1880 {
1881   unsigned char *src = source;
1882   unsigned char *src_end = source + src_bytes;
1883   unsigned char *dst = destination;
1884   unsigned char *dst_end = destination + dst_bytes;
1885   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1886      from DST_END to assure overflow checking is necessary only at the
1887      head of loop.  */
1888   unsigned char *adjusted_dst_end = dst_end - 3;
1889   Lisp_Object unification_table
1890       = coding->character_unification_table_for_decode;
1891
1892   if (!NILP (Venable_character_unification) && NILP (unification_table))
1893     unification_table = Vstandard_character_unification_table_for_decode;
1894
1895   while (src < src_end && dst < adjusted_dst_end)
1896     {
1897       /* SRC_BASE remembers the start position in source in each loop.
1898          The loop will be exited when there's not enough source text
1899          to analyze two-byte character (within macro ONE_MORE_BYTE).
1900          In that case, SRC is reset to SRC_BASE before exiting.  */
1901       unsigned char *src_base = src;
1902       unsigned char c1 = *src++, c2, c3, c4;
1903
1904       if (c1 == '\r')
1905         {
1906           if (coding->eol_type == CODING_EOL_CRLF)
1907             {
1908               ONE_MORE_BYTE (c2);
1909               if (c2 == '\n')
1910                 *dst++ = c2;
1911               else
1912                 /* To process C2 again, SRC is subtracted by 1.  */
1913                 *dst++ = c1, src--;
1914             }
1915           else
1916             *dst++ = c1;
1917         }
1918       else if (c1 < 0x20)
1919         *dst++ = c1;
1920       else if (c1 < 0x80)
1921         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1922       else if (c1 < 0xA0 || c1 >= 0xE0)
1923         {
1924           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1925           if (sjis_p)
1926             {
1927               ONE_MORE_BYTE (c2);
1928               DECODE_SJIS (c1, c2, c3, c4);
1929               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1930             }
1931           else if (c1 >= 0xE0 && c1 < 0xFF)
1932             {
1933               int charset;
1934
1935               ONE_MORE_BYTE (c2);
1936               DECODE_BIG5 (c1, c2, charset, c3, c4);
1937               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1938             }
1939           else                  /* Invalid code */
1940             *dst++ = c1;
1941         }
1942       else
1943         {
1944           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1945           if (sjis_p)
1946             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1947           else
1948             {
1949               int charset;
1950
1951               ONE_MORE_BYTE (c2);
1952               DECODE_BIG5 (c1, c2, charset, c3, c4);
1953               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1954             }
1955         }
1956       continue;
1957
1958     label_end_of_loop:
1959       coding->carryover_size = src - src_base;
1960       bcopy (src_base, coding->carryover, coding->carryover_size);
1961       src = src_base;
1962       break;
1963     }
1964
1965   *consumed = src - source;
1966   return dst - destination;
1967 }
1968
1969 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1970    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1971    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1972    sure that all these charsets are registered as official charset
1973    (i.e. do not have extended leading-codes).  Characters of other
1974    charsets are produced without any encoding.  If SJIS_P is 1, encode
1975    SJIS text, else encode BIG5 text.  */
1976
1977 int
1978 encode_coding_sjis_big5 (coding, source, destination,
1979                          src_bytes, dst_bytes, consumed, sjis_p)
1980      struct coding_system *coding;
1981      unsigned char *source, *destination;
1982      int src_bytes, dst_bytes;
1983      int *consumed;
1984      int sjis_p;
1985 {
1986   unsigned char *src = source;
1987   unsigned char *src_end = source + src_bytes;
1988   unsigned char *dst = destination;
1989   unsigned char *dst_end = destination + dst_bytes;
1990   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1991      from DST_END to assure overflow checking is necessary only at the
1992      head of loop.  */
1993   unsigned char *adjusted_dst_end = dst_end - 1;
1994   Lisp_Object unification_table
1995       = coding->character_unification_table_for_encode;
1996
1997   if (!NILP (Venable_character_unification) && NILP (unification_table))
1998     unification_table = Vstandard_character_unification_table_for_encode;
1999
2000   while (src < src_end && dst < adjusted_dst_end)
2001     {
2002       /* SRC_BASE remembers the start position in source in each loop.
2003          The loop will be exited when there's not enough source text
2004          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2005          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2006          before exiting.  */
2007       unsigned char *src_base = src;
2008       unsigned char c1 = *src++, c2, c3, c4;
2009
2010       if (coding->composing)
2011         {
2012           if (c1 == 0xA0)
2013             {
2014               ONE_MORE_BYTE (c1);
2015               c1 &= 0x7F;
2016             }
2017           else if (c1 >= 0xA0)
2018             c1 -= 0x20;
2019           else
2020             coding->composing = 0;
2021         }
2022
2023       switch (emacs_code_class[c1])
2024         {
2025         case EMACS_ascii_code:
2026           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2027           break;
2028
2029         case EMACS_control_code:
2030           *dst++ = c1;
2031           break;
2032
2033         case EMACS_carriage_return_code:
2034           if (!coding->selective)
2035             {
2036               *dst++ = c1;
2037               break;
2038             }
2039           /* fall down to treat '\r' as '\n' ...  */
2040
2041         case EMACS_linefeed_code:
2042           if (coding->eol_type == CODING_EOL_LF
2043               || coding->eol_type == CODING_EOL_UNDECIDED)
2044             *dst++ = '\n';
2045           else if (coding->eol_type == CODING_EOL_CRLF)
2046             *dst++ = '\r', *dst++ = '\n';
2047           else
2048             *dst++ = '\r';
2049           break;
2050
2051         case EMACS_leading_code_2:
2052           ONE_MORE_BYTE (c2);
2053           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2054           break;
2055
2056         case EMACS_leading_code_3:
2057           TWO_MORE_BYTES (c2, c3);
2058           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2059           break;
2060
2061         case EMACS_leading_code_4:
2062           THREE_MORE_BYTES (c2, c3, c4);
2063           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2064           break;
2065
2066         case EMACS_leading_code_composition:
2067           coding->composing = 1;
2068           break;
2069
2070         default:                /* i.e. case EMACS_invalid_code: */
2071           *dst++ = c1;
2072         }
2073       continue;
2074
2075     label_end_of_loop:
2076       coding->carryover_size = src_end - src_base;
2077       bcopy (src_base, coding->carryover, coding->carryover_size);
2078       src = src_end;
2079       break;
2080     }
2081
2082   *consumed = src - source;
2083   return dst - destination;
2084 }
2085
2086 \f
2087 /*** 5. End-of-line handlers ***/
2088
2089 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2090    This function is called only when `coding->eol_type' is
2091    CODING_EOL_CRLF or CODING_EOL_CR.  */
2092
2093 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2094      struct coding_system *coding;
2095      unsigned char *source, *destination;
2096      int src_bytes, dst_bytes;
2097      int *consumed;
2098 {
2099   unsigned char *src = source;
2100   unsigned char *src_end = source + src_bytes;
2101   unsigned char *dst = destination;
2102   unsigned char *dst_end = destination + dst_bytes;
2103   int produced;
2104
2105   switch (coding->eol_type)
2106     {
2107     case CODING_EOL_CRLF:
2108       {
2109         /* Since the maximum bytes produced by each loop is 2, we
2110            subtract 1 from DST_END to assure overflow checking is
2111            necessary only at the head of loop.  */
2112         unsigned char *adjusted_dst_end = dst_end - 1;
2113
2114         while (src < src_end && dst < adjusted_dst_end)
2115           {
2116             unsigned char *src_base = src;
2117             unsigned char c = *src++;
2118             if (c == '\r')
2119               {
2120                 ONE_MORE_BYTE (c);
2121                 if (c != '\n')
2122                   *dst++ = '\r';
2123                 *dst++ = c;
2124               }
2125             else
2126               *dst++ = c;
2127             continue;
2128
2129           label_end_of_loop:
2130             coding->carryover_size = src - src_base;
2131             bcopy (src_base, coding->carryover, coding->carryover_size);
2132             src = src_base;
2133             break;
2134           }
2135         *consumed = src - source;
2136         produced = dst - destination;
2137         break;
2138       }
2139
2140     case CODING_EOL_CR:
2141       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2142       bcopy (source, destination, produced);
2143       dst_end = destination + produced;
2144       while (dst < dst_end)
2145         if (*dst++ == '\r') dst[-1] = '\n';
2146       *consumed = produced;
2147       break;
2148
2149     default:                    /* i.e. case: CODING_EOL_LF */
2150       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2151       bcopy (source, destination, produced);
2152       *consumed = produced;
2153       break;
2154     }
2155
2156   return produced;
2157 }
2158
2159 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2160    format of end-of-line according to `coding->eol_type'.  If
2161    `coding->selective' is 1, code '\r' in source text also means
2162    end-of-line.  */
2163
2164 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2165      struct coding_system *coding;
2166      unsigned char *source, *destination;
2167      int src_bytes, dst_bytes;
2168      int *consumed;
2169 {
2170   unsigned char *src = source;
2171   unsigned char *dst = destination;
2172   int produced;
2173
2174   if (src_bytes <= 0)
2175     return 0;
2176
2177   switch (coding->eol_type)
2178     {
2179     case CODING_EOL_LF:
2180     case CODING_EOL_UNDECIDED:
2181       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2182       bcopy (source, destination, produced);
2183       if (coding->selective)
2184         {
2185           int i = produced;
2186           while (i--)
2187             if (*dst++ == '\r') dst[-1] = '\n';
2188         }
2189       *consumed = produced;
2190
2191     case CODING_EOL_CRLF:
2192       {
2193         unsigned char c;
2194         unsigned char *src_end = source + src_bytes;
2195         unsigned char *dst_end = destination + dst_bytes;
2196         /* Since the maximum bytes produced by each loop is 2, we
2197            subtract 1 from DST_END to assure overflow checking is
2198            necessary only at the head of loop.  */
2199         unsigned char *adjusted_dst_end = dst_end - 1;
2200
2201         while (src < src_end && dst < adjusted_dst_end)
2202           {
2203             c = *src++;
2204             if (c == '\n' || (c == '\r' && coding->selective))
2205               *dst++ = '\r', *dst++ = '\n';
2206             else
2207               *dst++ = c;
2208           }
2209         produced = dst - destination;
2210         *consumed = src - source;
2211         break;
2212       }
2213
2214     default:                    /* i.e. case CODING_EOL_CR: */
2215       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2216       bcopy (source, destination, produced);
2217       {
2218         int i = produced;
2219         while (i--)
2220           if (*dst++ == '\n') dst[-1] = '\r';
2221       }
2222       *consumed = produced;
2223     }
2224
2225   return produced;
2226 }
2227
2228 \f
2229 /*** 6. C library functions ***/
2230
2231 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2232    has a property `coding-system'.  The value of this property is a
2233    vector of length 5 (called as coding-vector).  Among elements of
2234    this vector, the first (element[0]) and the fifth (element[4])
2235    carry important information for decoding/encoding.  Before
2236    decoding/encoding, this information should be set in fields of a
2237    structure of type `coding_system'.
2238
2239    A value of property `coding-system' can be a symbol of another
2240    subsidiary coding-system.  In that case, Emacs gets coding-vector
2241    from that symbol.
2242
2243    `element[0]' contains information to be set in `coding->type'.  The
2244    value and its meaning is as follows:
2245
2246    0 -- coding_type_emacs_mule
2247    1 -- coding_type_sjis
2248    2 -- coding_type_iso2022
2249    3 -- coding_type_big5
2250    4 -- coding_type_ccl encoder/decoder written in CCL
2251    nil -- coding_type_no_conversion
2252    t -- coding_type_undecided (automatic conversion on decoding,
2253                                no-conversion on encoding)
2254
2255    `element[4]' contains information to be set in `coding->flags' and
2256    `coding->spec'.  The meaning varies by `coding->type'.
2257
2258    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2259    of length 32 (of which the first 13 sub-elements are used now).
2260    Meanings of these sub-elements are:
2261
2262    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2263         If the value is an integer of valid charset, the charset is
2264         assumed to be designated to graphic register N initially.
2265
2266         If the value is minus, it is a minus value of charset which
2267         reserves graphic register N, which means that the charset is
2268         not designated initially but should be designated to graphic
2269         register N just before encoding a character in that charset.
2270
2271         If the value is nil, graphic register N is never used on
2272         encoding.
2273
2274    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2275         Each value takes t or nil.  See the section ISO2022 of
2276         `coding.h' for more information.
2277
2278    If `coding->type' is `coding_type_big5', element[4] is t to denote
2279    BIG5-ETen or nil to denote BIG5-HKU.
2280
2281    If `coding->type' takes the other value, element[4] is ignored.
2282
2283    Emacs Lisp's coding system also carries information about format of
2284    end-of-line in a value of property `eol-type'.  If the value is
2285    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2286    means CODING_EOL_CR.  If it is not integer, it should be a vector
2287    of subsidiary coding systems of which property `eol-type' has one
2288    of above values.
2289
2290 */
2291
2292 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2293    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2294    is setup so that no conversion is necessary and return -1, else
2295    return 0.  */
2296
2297 int
2298 setup_coding_system (coding_system, coding)
2299      Lisp_Object coding_system;
2300      struct coding_system *coding;
2301 {
2302   Lisp_Object type, eol_type;
2303
2304   /* At first, set several fields to default values.  */
2305   coding->require_flushing = 0;
2306   coding->last_block = 0;
2307   coding->selective = 0;
2308   coding->composing = 0;
2309   coding->direction = 0;
2310   coding->carryover_size = 0;
2311   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2312   coding->character_unification_table_for_decode = Qnil;
2313   coding->character_unification_table_for_encode = Qnil;
2314
2315   Vlast_coding_system_used = coding->symbol = coding_system;
2316   eol_type = Qnil;
2317   /* Get value of property `coding-system' until we get a vector.
2318      While doing that, also get values of properties
2319      `post-read-conversion', `pre-write-conversion',
2320      `character-unification-table-for-decode',
2321      `character-unification-table-for-encode' and `eol-type'.  */
2322   while (!NILP (coding_system) && SYMBOLP (coding_system))
2323     {
2324       if (NILP (coding->post_read_conversion))
2325         coding->post_read_conversion = Fget (coding_system,
2326                                              Qpost_read_conversion);
2327       if (NILP (coding->pre_write_conversion))
2328         coding->pre_write_conversion = Fget (coding_system,
2329                                              Qpre_write_conversion);
2330       if (!inhibit_eol_conversion && NILP (eol_type))
2331         eol_type = Fget (coding_system, Qeol_type);
2332
2333       if (NILP (coding->character_unification_table_for_decode))
2334         coding->character_unification_table_for_decode
2335           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2336
2337       if (NILP (coding->character_unification_table_for_encode))
2338         coding->character_unification_table_for_encode
2339           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2340
2341       coding_system = Fget (coding_system, Qcoding_system);
2342     }
2343
2344   while (!NILP (coding->character_unification_table_for_decode)
2345          && SYMBOLP (coding->character_unification_table_for_decode))
2346         coding->character_unification_table_for_decode
2347           = Fget (coding->character_unification_table_for_decode,
2348                   Qcharacter_unification_table_for_decode);
2349   if (!NILP (coding->character_unification_table_for_decode)
2350       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2351       coding->character_unification_table_for_decode = Qnil;
2352
2353   while (!NILP (coding->character_unification_table_for_encode)
2354          && SYMBOLP (coding->character_unification_table_for_encode))
2355         coding->character_unification_table_for_encode
2356           = Fget (coding->character_unification_table_for_encode,
2357                   Qcharacter_unification_table_for_encode);
2358   if (!NILP (coding->character_unification_table_for_encode)
2359       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2360       coding->character_unification_table_for_encode = Qnil;
2361
2362   if (!VECTORP (coding_system)
2363       || XVECTOR (coding_system)->size != 5)
2364     goto label_invalid_coding_system;
2365
2366   if (VECTORP (eol_type))
2367     coding->eol_type = CODING_EOL_UNDECIDED;
2368   else if (XFASTINT (eol_type) == 1)
2369     coding->eol_type = CODING_EOL_CRLF;
2370   else if (XFASTINT (eol_type) == 2)
2371     coding->eol_type = CODING_EOL_CR;
2372   else
2373     coding->eol_type = CODING_EOL_LF;
2374
2375   type = XVECTOR (coding_system)->contents[0];
2376   switch (XFASTINT (type))
2377     {
2378     case 0:
2379       coding->type = coding_type_emacs_mule;
2380       break;
2381
2382     case 1:
2383       coding->type = coding_type_sjis;
2384       break;
2385
2386     case 2:
2387       coding->type = coding_type_iso2022;
2388       {
2389         Lisp_Object val;
2390         Lisp_Object *flags;
2391         int i, charset, default_reg_bits = 0;
2392
2393         val = XVECTOR (coding_system)->contents[4];
2394
2395         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2396           goto label_invalid_coding_system;
2397
2398         flags = XVECTOR (val)->contents;
2399         coding->flags
2400           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2401              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2402              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2403              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2404              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2405              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2406              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2407              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2408              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2409              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2410              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2411              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2412              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2413              );
2414
2415         /* Invoke graphic register 0 to plane 0.  */
2416         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2417         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2418         CODING_SPEC_ISO_INVOCATION (coding, 1)
2419           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2420         /* Not single shifting at first.  */
2421         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2422         /* Beginning of buffer should also be regarded as bol. */
2423         CODING_SPEC_ISO_BOL (coding) = 1;
2424
2425         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2426            FLAGS[REG] can be one of below:
2427                 integer CHARSET: CHARSET occupies register I,
2428                 t: designate nothing to REG initially, but can be used
2429                   by any charsets,
2430                 list of integer, nil, or t: designate the first
2431                   element (if integer) to REG initially, the remaining
2432                   elements (if integer) is designated to REG on request,
2433                   if an element is t, REG can be used by any charset,
2434                 nil: REG is never used.  */
2435         for (charset = 0; charset <= MAX_CHARSET; charset++)
2436           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2437             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2438         bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2439         for (i = 0; i < 4; i++)
2440           {
2441             if (INTEGERP (flags[i])
2442                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2443                 || (charset = get_charset_id (flags[i])) >= 0)
2444               {
2445                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2446                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2447                 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2448               }
2449             else if (EQ (flags[i], Qt))
2450               {
2451                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2452                 default_reg_bits |= 1 << i;
2453               }
2454             else if (CONSP (flags[i]))
2455               {
2456                 Lisp_Object tail = flags[i];
2457
2458                 if (INTEGERP (XCONS (tail)->car)
2459                     && (charset = XINT (XCONS (tail)->car),
2460                         CHARSET_VALID_P (charset))
2461                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2462                   {
2463                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2464                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2465                     CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2466                   }
2467                 else
2468                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2469                 tail = XCONS (tail)->cdr;
2470                 while (CONSP (tail))
2471                   {
2472                     if (INTEGERP (XCONS (tail)->car)
2473                         && (charset = XINT (XCONS (tail)->car),
2474                             CHARSET_VALID_P (charset))
2475                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2476                       {
2477                         CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2478                           = i;
2479                         CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2480                           = 1;
2481                       }
2482                     else if (EQ (XCONS (tail)->car, Qt))
2483                       default_reg_bits |= 1 << i;
2484                     tail = XCONS (tail)->cdr;
2485                   }
2486               }
2487             else
2488               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2489
2490             CODING_SPEC_ISO_DESIGNATION (coding, i)
2491               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2492           }
2493
2494         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2495           {
2496             /* REG 1 can be used only by locking shift in 7-bit env.  */
2497             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2498               default_reg_bits &= ~2;
2499             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2500               /* Without any shifting, only REG 0 and 1 can be used.  */
2501               default_reg_bits &= 3;
2502           }
2503
2504         for (charset = 0; charset <= MAX_CHARSET; charset++)
2505           if (CHARSET_VALID_P (charset)
2506               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2507                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2508             {
2509               /* We have not yet decided where to designate CHARSET.  */
2510               int reg_bits = default_reg_bits;
2511
2512               if (CHARSET_CHARS (charset) == 96)
2513                 /* A charset of CHARS96 can't be designated to REG 0.  */
2514                 reg_bits &= ~1;
2515
2516               if (reg_bits)
2517                 /* There exist some default graphic register.  */
2518                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2519                   = (reg_bits & 1
2520                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2521               else
2522                 /* We anyway have to designate CHARSET to somewhere.  */
2523                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2524                   = (CHARSET_CHARS (charset) == 94
2525                      ? 0
2526                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2527                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2528                         ? 1
2529                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2530                            ? 2 : 0)));
2531             }
2532       }
2533       coding->require_flushing = 1;
2534       break;
2535
2536     case 3:
2537       coding->type = coding_type_big5;
2538       coding->flags
2539         = (NILP (XVECTOR (coding_system)->contents[4])
2540            ? CODING_FLAG_BIG5_HKU
2541            : CODING_FLAG_BIG5_ETEN);
2542       break;
2543
2544     case 4:
2545       coding->type = coding_type_ccl;
2546       {
2547         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2548         if (CONSP  (val)
2549             && VECTORP (XCONS (val)->car)
2550             && VECTORP (XCONS (val)->cdr))
2551           {
2552             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2553             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2554           }
2555         else
2556           goto label_invalid_coding_system;
2557       }
2558       coding->require_flushing = 1;
2559       break;
2560
2561     case 5:
2562       coding->type = coding_type_raw_text;
2563       break;
2564
2565     default:
2566       if (EQ (type, Qt))
2567         coding->type = coding_type_undecided;
2568       else
2569         coding->type = coding_type_no_conversion;
2570       break;
2571     }
2572   return 0;
2573
2574  label_invalid_coding_system:
2575   coding->type = coding_type_no_conversion;
2576   coding->eol_type = CODING_EOL_LF;
2577   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2578     = Qnil;
2579   return -1;
2580 }
2581
2582 /* Emacs has a mechanism to automatically detect a coding system if it
2583    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2584    it's impossible to distinguish some coding systems accurately
2585    because they use the same range of codes.  So, at first, coding
2586    systems are categorized into 7, those are:
2587
2588    o coding-category-emacs-mule
2589
2590         The category for a coding system which has the same code range
2591         as Emacs' internal format.  Assigned the coding-system (Lisp
2592         symbol) `emacs-mule' by default.
2593
2594    o coding-category-sjis
2595
2596         The category for a coding system which has the same code range
2597         as SJIS.  Assigned the coding-system (Lisp
2598         symbol) `japanese-shift-jis' by default.
2599
2600    o coding-category-iso-7
2601
2602         The category for a coding system which has the same code range
2603         as ISO2022 of 7-bit environment.  This doesn't use any locking
2604         shift and single shift functions.  Assigned the coding-system
2605         (Lisp symbol) `iso-2022-7bit' by default.
2606
2607    o coding-category-iso-8-1
2608
2609         The category for a coding system which has the same code range
2610         as ISO2022 of 8-bit environment and graphic plane 1 used only
2611         for DIMENSION1 charset.  This doesn't use any locking shift
2612         and single shift functions.  Assigned the coding-system (Lisp
2613         symbol) `iso-latin-1' by default.
2614
2615    o coding-category-iso-8-2
2616
2617         The category for a coding system which has the same code range
2618         as ISO2022 of 8-bit environment and graphic plane 1 used only
2619         for DIMENSION2 charset.  This doesn't use any locking shift
2620         and single shift functions.  Assigned the coding-system (Lisp
2621         symbol) `japanese-iso-8bit' by default.
2622
2623    o coding-category-iso-7-else
2624
2625         The category for a coding system which has the same code range
2626         as ISO2022 of 7-bit environemnt but uses locking shift or
2627         single shift functions.  Assigned the coding-system (Lisp
2628         symbol) `iso-2022-7bit-lock' by default.
2629
2630    o coding-category-iso-8-else
2631
2632         The category for a coding system which has the same code range
2633         as ISO2022 of 8-bit environemnt but uses locking shift or
2634         single shift functions.  Assigned the coding-system (Lisp
2635         symbol) `iso-2022-8bit-ss2' by default.
2636
2637    o coding-category-big5
2638
2639         The category for a coding system which has the same code range
2640         as BIG5.  Assigned the coding-system (Lisp symbol)
2641         `cn-big5' by default.
2642
2643    o coding-category-binary
2644
2645         The category for a coding system not categorized in any of the
2646         above.  Assigned the coding-system (Lisp symbol)
2647         `no-conversion' by default.
2648
2649    Each of them is a Lisp symbol and the value is an actual
2650    `coding-system's (this is also a Lisp symbol) assigned by a user.
2651    What Emacs does actually is to detect a category of coding system.
2652    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2653    decide only one possible category, it selects a category of the
2654    highest priority.  Priorities of categories are also specified by a
2655    user in a Lisp variable `coding-category-list'.
2656
2657 */
2658
2659 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2660    If it detects possible coding systems, return an integer in which
2661    appropriate flag bits are set.  Flag bits are defined by macros
2662    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2663
2664 int
2665 detect_coding_mask (src, src_bytes)
2666      unsigned char *src;
2667      int src_bytes;
2668 {
2669   register unsigned char c;
2670   unsigned char *src_end = src + src_bytes;
2671   int mask;
2672
2673   /* At first, skip all ASCII characters and control characters except
2674      for three ISO2022 specific control characters.  */
2675  label_loop_detect_coding:
2676   while (src < src_end)
2677     {
2678       c = *src;
2679       if (c >= 0x80
2680           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2681         break;
2682       src++;
2683     }
2684
2685   if (src >= src_end)
2686     /* We found nothing other than ASCII.  There's nothing to do.  */
2687     return CODING_CATEGORY_MASK_ANY;
2688
2689   /* The text seems to be encoded in some multilingual coding system.
2690      Now, try to find in which coding system the text is encoded.  */
2691   if (c < 0x80)
2692     {
2693       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2694       /* C is an ISO2022 specific control code of C0.  */
2695       mask = detect_coding_iso2022 (src, src_end);
2696       src++;
2697       if (mask == 0)
2698         /* No valid ISO2022 code follows C.  Try again.  */
2699         goto label_loop_detect_coding;
2700       mask |= CODING_CATEGORY_MASK_RAW_TEXT;
2701     }
2702   else if (c < 0xA0)
2703     {
2704       /* If C is a special latin extra code,
2705          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2706          or is an ISO2022 control-sequence-introducer (CSI),
2707          we should also consider the possibility of ISO2022 codings.  */
2708       if ((VECTORP (Vlatin_extra_code_table)
2709            && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2710           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2711           || (c == ISO_CODE_CSI
2712               && (src < src_end
2713                   && (*src == ']'
2714                       || (src + 1 < src_end
2715                           && src[1] == ']'
2716                           && (*src == '0' || *src == '1' || *src == '2'))))))
2717         mask = (detect_coding_iso2022 (src, src_end)
2718                 | detect_coding_sjis (src, src_end)
2719                 | detect_coding_emacs_mule (src, src_end)
2720                 | CODING_CATEGORY_MASK_RAW_TEXT);
2721
2722       else
2723         /* C is the first byte of SJIS character code,
2724            or a leading-code of Emacs' internal format (emacs-mule).  */
2725         mask = (detect_coding_sjis (src, src_end)
2726                 | detect_coding_emacs_mule (src, src_end)
2727                 | CODING_CATEGORY_MASK_RAW_TEXT);
2728     }
2729   else
2730     /* C is a character of ISO2022 in graphic plane right,
2731        or a SJIS's 1-byte character code (i.e. JISX0201),
2732        or the first byte of BIG5's 2-byte code.  */
2733     mask = (detect_coding_iso2022 (src, src_end)
2734             | detect_coding_sjis (src, src_end)
2735             | detect_coding_big5 (src, src_end)
2736             | CODING_CATEGORY_MASK_RAW_TEXT);
2737
2738   return mask;
2739 }
2740
2741 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2742    The information of the detected coding system is set in CODING.  */
2743
2744 void
2745 detect_coding (coding, src, src_bytes)
2746      struct coding_system *coding;
2747      unsigned char *src;
2748      int src_bytes;
2749 {
2750   int mask = detect_coding_mask (src, src_bytes);
2751   int idx;
2752   Lisp_Object val = Vcoding_category_list;
2753
2754   if (mask == CODING_CATEGORY_MASK_ANY)
2755     /* We found nothing other than ASCII.  There's nothing to do.  */
2756     return;
2757
2758   /* We found some plausible coding systems.  Let's use a coding
2759      system of the highest priority.  */
2760
2761   if (CONSP (val))
2762     while (!NILP (val))
2763       {
2764         idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2765         if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2766           break;
2767         val = XCONS (val)->cdr;
2768       }
2769   else
2770     val = Qnil;
2771
2772   if (NILP (val))
2773     {
2774       /* For unknown reason, `Vcoding_category_list' contains none of
2775          found categories.  Let's use any of them.  */
2776       for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2777         if (mask & (1 << idx))
2778           break;
2779     }
2780   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2781 }
2782
2783 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2784    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2785    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2786
2787 #define MAX_EOL_CHECK_COUNT 3
2788
2789 int
2790 detect_eol_type (src, src_bytes)
2791      unsigned char *src;
2792      int src_bytes;
2793 {
2794   unsigned char *src_end = src + src_bytes;
2795   unsigned char c;
2796   int total = 0;                /* How many end-of-lines are found so far.  */
2797   int eol_type = CODING_EOL_UNDECIDED;
2798   int this_eol_type;
2799
2800   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2801     {
2802       c = *src++;
2803       if (c == '\n' || c == '\r')
2804         {
2805           total++;
2806           if (c == '\n')
2807             this_eol_type = CODING_EOL_LF;
2808           else if (src >= src_end || *src != '\n')
2809             this_eol_type = CODING_EOL_CR;
2810           else
2811             this_eol_type = CODING_EOL_CRLF, src++;
2812
2813           if (eol_type == CODING_EOL_UNDECIDED)
2814             /* This is the first end-of-line.  */
2815             eol_type = this_eol_type;
2816           else if (eol_type != this_eol_type)
2817             /* The found type is different from what found before.
2818                Let's notice the caller about this inconsistency.  */
2819             return CODING_EOL_INCONSISTENT;
2820         }
2821     }
2822
2823   return eol_type;
2824 }
2825
2826 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2827    is encoded.  If it detects an appropriate format of end-of-line, it
2828    sets the information in *CODING.  */
2829
2830 void
2831 detect_eol (coding, src, src_bytes)
2832      struct coding_system *coding;
2833      unsigned char *src;
2834      int src_bytes;
2835 {
2836   Lisp_Object val, coding_system;
2837   int eol_type = detect_eol_type (src, src_bytes);
2838
2839   if (eol_type == CODING_EOL_UNDECIDED)
2840     /*  We found no end-of-line in the source text.  */
2841     return;
2842
2843   if (eol_type == CODING_EOL_INCONSISTENT)
2844     {
2845 #if 0
2846       /* This code is suppressed until we find a better way to
2847          distinguish raw text file and binary file.  */
2848
2849       /* If we have already detected that the coding is raw-text, the
2850          coding should actually be no-conversion.  */
2851       if (coding->type == coding_type_raw_text)
2852         {
2853           setup_coding_system (Qno_conversion, coding);
2854           return;
2855         }
2856       /* Else, let's decode only text code anyway.  */
2857 #endif /* 0 */
2858       eol_type = CODING_EOL_LF;
2859     }
2860
2861   coding_system = coding->symbol;
2862   while (!NILP (coding_system)
2863          && NILP (val = Fget (coding_system, Qeol_type)))
2864     coding_system = Fget (coding_system, Qcoding_system);
2865   if (VECTORP (val) && XVECTOR (val)->size == 3)
2866     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2867 }
2868
2869 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2870    decoding, it may detect coding system and format of end-of-line if
2871    those are not yet decided.  */
2872
2873 int
2874 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2875      struct coding_system *coding;
2876      unsigned char *source, *destination;
2877      int src_bytes, dst_bytes;
2878      int *consumed;
2879 {
2880   int produced;
2881
2882   if (src_bytes <= 0)
2883     {
2884       *consumed = 0;
2885       return 0;
2886     }
2887
2888   if (coding->type == coding_type_undecided)
2889     detect_coding (coding, source, src_bytes);
2890
2891   if (coding->eol_type == CODING_EOL_UNDECIDED)
2892     detect_eol (coding, source, src_bytes);
2893
2894   coding->carryover_size = 0;
2895   switch (coding->type)
2896     {
2897     case coding_type_no_conversion:
2898     label_no_conversion:
2899       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2900       bcopy (source, destination, produced);
2901       *consumed = produced;
2902       break;
2903
2904     case coding_type_emacs_mule:
2905     case coding_type_undecided:
2906     case coding_type_raw_text:
2907       if (coding->eol_type == CODING_EOL_LF
2908           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2909         goto label_no_conversion;
2910       produced = decode_eol (coding, source, destination,
2911                              src_bytes, dst_bytes, consumed);
2912       break;
2913
2914     case coding_type_sjis:
2915       produced = decode_coding_sjis_big5 (coding, source, destination,
2916                                           src_bytes, dst_bytes, consumed,
2917                                           1);
2918       break;
2919
2920     case coding_type_iso2022:
2921       produced = decode_coding_iso2022 (coding, source, destination,
2922                                         src_bytes, dst_bytes, consumed);
2923       break;
2924
2925     case coding_type_big5:
2926       produced = decode_coding_sjis_big5 (coding, source, destination,
2927                                           src_bytes, dst_bytes, consumed,
2928                                           0);
2929       break;
2930
2931     case coding_type_ccl:
2932       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2933                              src_bytes, dst_bytes, consumed);
2934       break;
2935     }
2936
2937   return produced;
2938 }
2939
2940 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2941
2942 int
2943 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2944      struct coding_system *coding;
2945      unsigned char *source, *destination;
2946      int src_bytes, dst_bytes;
2947      int *consumed;
2948 {
2949   int produced;
2950
2951   switch (coding->type)
2952     {
2953     case coding_type_no_conversion:
2954     label_no_conversion:
2955       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2956       if (produced > 0)
2957         {
2958           bcopy (source, destination, produced);
2959           if (coding->selective)
2960             {
2961               unsigned char *p = destination, *pend = destination + produced;
2962               while (p < pend)
2963                 if (*p++ == '\015') p[-1] = '\n';
2964             }
2965         }
2966       *consumed = produced;
2967       break;
2968
2969     case coding_type_emacs_mule:
2970     case coding_type_undecided:
2971     case coding_type_raw_text:
2972       if (coding->eol_type == CODING_EOL_LF
2973           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2974         goto label_no_conversion;
2975       produced = encode_eol (coding, source, destination,
2976                              src_bytes, dst_bytes, consumed);
2977       break;
2978
2979     case coding_type_sjis:
2980       produced = encode_coding_sjis_big5 (coding, source, destination,
2981                                           src_bytes, dst_bytes, consumed,
2982                                           1);
2983       break;
2984
2985     case coding_type_iso2022:
2986       produced = encode_coding_iso2022 (coding, source, destination,
2987                                         src_bytes, dst_bytes, consumed);
2988       break;
2989
2990     case coding_type_big5:
2991       produced = encode_coding_sjis_big5 (coding, source, destination,
2992                                           src_bytes, dst_bytes, consumed,
2993                                           0);
2994       break;
2995
2996     case coding_type_ccl:
2997       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2998                              src_bytes, dst_bytes, consumed);
2999       break;
3000     }
3001
3002   return produced;
3003 }
3004
3005 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3006
3007 /* Return maximum size (bytes) of a buffer enough for decoding
3008    SRC_BYTES of text encoded in CODING.  */
3009
3010 int
3011 decoding_buffer_size (coding, src_bytes)
3012      struct coding_system *coding;
3013      int src_bytes;
3014 {
3015   int magnification;
3016
3017   if (coding->type == coding_type_iso2022)
3018     magnification = 3;
3019   else if (coding->type == coding_type_ccl)
3020     magnification = coding->spec.ccl.decoder.buf_magnification;
3021   else
3022     magnification = 2;
3023
3024   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3025 }
3026
3027 /* Return maximum size (bytes) of a buffer enough for encoding
3028    SRC_BYTES of text to CODING.  */
3029
3030 int
3031 encoding_buffer_size (coding, src_bytes)
3032      struct coding_system *coding;
3033      int src_bytes;
3034 {
3035   int magnification;
3036
3037   if (coding->type == coding_type_ccl)
3038     magnification = coding->spec.ccl.encoder.buf_magnification;
3039   else
3040     magnification = 3;
3041
3042   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3043 }
3044
3045 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3046 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3047 #endif
3048
3049 char *conversion_buffer;
3050 int conversion_buffer_size;
3051
3052 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3053    or decoding.  Sufficient memory is allocated automatically.  If we
3054    run out of memory, return NULL.  */
3055
3056 char *
3057 get_conversion_buffer (size)
3058      int size;
3059 {
3060   if (size > conversion_buffer_size)
3061     {
3062       char *buf;
3063       int real_size = conversion_buffer_size * 2;
3064
3065       while (real_size < size) real_size *= 2;
3066       buf = (char *) xmalloc (real_size);
3067       xfree (conversion_buffer);
3068       conversion_buffer = buf;
3069       conversion_buffer_size = real_size;
3070     }
3071   return conversion_buffer;
3072 }
3073
3074 \f
3075 #ifdef emacs
3076 /*** 7. Emacs Lisp library functions ***/
3077
3078 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3079        1, 1, 0,
3080   "Return coding-spec of CODING-SYSTEM.\n\
3081 If CODING-SYSTEM is not a valid coding-system, return nil.")
3082   (obj)
3083      Lisp_Object obj;
3084 {
3085   while (SYMBOLP (obj) && !NILP (obj))
3086     obj = Fget (obj, Qcoding_system);
3087   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3088           ? Qnil : obj);
3089 }
3090
3091 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3092   "Return t if OBJECT is nil or a coding-system.\n\
3093 See document of make-coding-system for coding-system object.")
3094   (obj)
3095      Lisp_Object obj;
3096 {
3097   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3098 }
3099
3100 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3101        Sread_non_nil_coding_system, 1, 1, 0,
3102   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3103   (prompt)
3104      Lisp_Object prompt;
3105 {
3106   Lisp_Object val;
3107   do
3108     {
3109       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3110                               Qt, Qnil, Qnil, Qnil, Qnil);
3111     }
3112   while (XSTRING (val)->size == 0);
3113   return (Fintern (val, Qnil));
3114 }
3115
3116 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
3117   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
3118 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
3119   (prompt, default_coding_system)
3120      Lisp_Object prompt, default_coding_system;
3121 {
3122   Lisp_Object val;
3123   if (SYMBOLP (default_coding_system))
3124     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
3125   val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3126                           Qt, Qnil, Qcoding_system_history,
3127                           default_coding_system, Qnil);
3128   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3129 }
3130
3131 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3132        1, 1, 0,
3133   "Check validity of CODING-SYSTEM.\n\
3134 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3135 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3136 The value of property should be a vector of length 5.")
3137   (coding_system)
3138      Lisp_Object coding_system;
3139 {
3140   CHECK_SYMBOL (coding_system, 0);
3141   if (!NILP (Fcoding_system_p (coding_system)))
3142     return coding_system;
3143   while (1)
3144     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3145 }
3146
3147 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3148        2, 2, 0,
3149   "Detect coding system of the text in the region between START and END.\n\
3150 Return a list of possible coding systems ordered by priority.\n\
3151 If only ASCII characters are found, it returns `undecided'\n\
3152  or its subsidiary coding system according to a detected end-of-line format.")
3153   (b, e)
3154      Lisp_Object b, e;
3155 {
3156   int coding_mask, eol_type;
3157   Lisp_Object val;
3158   int beg, end;
3159
3160   validate_region (&b, &e);
3161   beg = XINT (b), end = XINT (e);
3162   if (beg < GPT && end >= GPT) move_gap (end);
3163
3164   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3165   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3166
3167   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3168     {
3169       val = Qundecided;
3170       if (eol_type != CODING_EOL_UNDECIDED
3171           && eol_type != CODING_EOL_INCONSISTENT)
3172         {
3173           Lisp_Object val2;
3174           val2 = Fget (Qundecided, Qeol_type);
3175           if (VECTORP (val2))
3176             val = XVECTOR (val2)->contents[eol_type];
3177         }
3178     }
3179   else
3180     {
3181       Lisp_Object val2;
3182
3183       /* At first, gather possible coding-systems in VAL in a reverse
3184          order.  */
3185       val = Qnil;
3186       for (val2 = Vcoding_category_list;
3187            !NILP (val2);
3188            val2 = XCONS (val2)->cdr)
3189         {
3190           int idx
3191             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3192           if (coding_mask & (1 << idx))
3193             {
3194 #if 0
3195               /* This code is suppressed until we find a better way to
3196                  distinguish raw text file and binary file.  */
3197
3198               if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3199                   && eol_type == CODING_EOL_INCONSISTENT)
3200                 val = Fcons (Qno_conversion, val);
3201               else
3202 #endif /* 0 */
3203                 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3204             }
3205         }
3206
3207       /* Then, change the order of the list, while getting subsidiary
3208          coding-systems.  */
3209       val2 = val;
3210       val = Qnil;
3211       if (eol_type == CODING_EOL_INCONSISTENT)
3212         eol_type == CODING_EOL_UNDECIDED;
3213       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3214         {
3215           if (eol_type == CODING_EOL_UNDECIDED)
3216             val = Fcons (XCONS (val2)->car, val);
3217           else
3218             {
3219               Lisp_Object val3;
3220               val3 = Fget (XCONS (val2)->car, Qeol_type);
3221               if (VECTORP (val3))
3222                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3223               else
3224                 val = Fcons (XCONS (val2)->car, val);
3225             }
3226         }
3227     }
3228
3229   return val;
3230 }
3231
3232 /* Scan text in the region between *BEGP and *ENDP, skip characters
3233    which we never have to encode to (iff ENCODEP is 1) or decode from
3234    coding system CODING at the head and tail, then set BEGP and ENDP
3235    to the addresses of start and end of the text we actually convert.  */
3236
3237 void
3238 shrink_conversion_area (begp, endp, coding, encodep)
3239      unsigned char **begp, **endp;
3240      struct coding_system *coding;
3241      int encodep;
3242 {
3243   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3244
3245   if (coding->eol_type != CODING_EOL_LF
3246       && coding->eol_type != CODING_EOL_UNDECIDED)
3247     /* Since we anyway have to convert end-of-line format, it is not
3248        worth skipping at most 100 bytes or so.  */
3249     return;
3250
3251   if (encodep)                  /* for encoding */
3252     {
3253       switch (coding->type)
3254         {
3255         case coding_type_no_conversion:
3256         case coding_type_emacs_mule:
3257         case coding_type_undecided:
3258         case coding_type_raw_text:
3259           /* We need no conversion.  */
3260           *begp = *endp;
3261           return;
3262         case coding_type_ccl:
3263           /* We can't skip any data.  */
3264           return;
3265         case coding_type_iso2022:
3266           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3267             {
3268               unsigned char *bol = beg_addr;
3269               while (beg_addr < end_addr && *beg_addr < 0x80)
3270                 {
3271                   beg_addr++;
3272                   if (*(beg_addr - 1) == '\n')
3273                     bol = beg_addr;
3274                 }
3275               beg_addr = bol;
3276               goto label_skip_tail;
3277             }
3278           /* fall down ... */
3279         default:
3280           /* We can skip all ASCII characters at the head and tail.  */
3281           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3282         label_skip_tail:
3283           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3284           break;
3285         }
3286     }
3287   else                          /* for decoding */
3288     {
3289       switch (coding->type)
3290         {
3291         case coding_type_no_conversion:
3292           /* We need no conversion.  */
3293           *begp = *endp;
3294           return;
3295         case coding_type_emacs_mule:
3296         case coding_type_raw_text:
3297           if (coding->eol_type == CODING_EOL_LF)
3298             {
3299               /* We need no conversion.  */
3300               *begp = *endp;
3301               return;
3302             }
3303           /* We can skip all but carriage-return.  */
3304           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3305           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3306           break;
3307         case coding_type_sjis:
3308         case coding_type_big5:
3309           /* We can skip all ASCII characters at the head.  */
3310           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3311           /* We can skip all ASCII characters at the tail except for
3312              the second byte of SJIS or BIG5 code.  */
3313           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3314           if (end_addr != *endp)
3315             end_addr++;
3316           break;
3317         case coding_type_ccl:
3318           /* We can't skip any data.  */
3319           return;
3320         default:                /* i.e. case coding_type_iso2022: */
3321           {
3322             unsigned char c;
3323
3324             /* We can skip all ASCII characters except for a few
3325                control codes at the head.  */
3326             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3327                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3328                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3329               beg_addr++;
3330           }
3331           break;
3332         }
3333     }
3334   *begp = beg_addr;
3335   *endp = end_addr;
3336   return;
3337 }
3338
3339 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3340    text between B and E.  B and E are buffer position.  */
3341
3342 Lisp_Object
3343 code_convert_region (b, e, coding, encodep)
3344      Lisp_Object b, e;
3345      struct coding_system *coding;
3346      int encodep;
3347 {
3348   int beg, end, len, consumed, produced;
3349   char *buf;
3350   unsigned char *begp, *endp;
3351   int pos = PT;
3352
3353   validate_region (&b, &e);
3354   beg = XINT (b), end = XINT (e);
3355   if (beg < GPT && end >= GPT)
3356     move_gap (end);
3357
3358   if (encodep && !NILP (coding->pre_write_conversion))
3359     {
3360       /* We must call a pre-conversion function which may put a new
3361          text to be converted in a new buffer.  */
3362       struct buffer *old = current_buffer, *new;
3363
3364       TEMP_SET_PT (beg);
3365       call2 (coding->pre_write_conversion, b, e);
3366       if (old != current_buffer)
3367         {
3368           /* Replace the original text by the text just generated.  */
3369           len = ZV - BEGV;
3370           new = current_buffer;
3371           set_buffer_internal (old);
3372           del_range (beg, end);
3373           insert_from_buffer (new, 1, len, 0);
3374           end = beg + len;
3375         }
3376     }
3377
3378   /* We may be able to shrink the conversion region.  */
3379   begp = POS_ADDR (beg); endp = begp + (end - beg);
3380   shrink_conversion_area (&begp, &endp, coding, encodep);
3381
3382   if (begp == endp)
3383     /* We need no conversion.  */
3384     len = end - beg;
3385   else
3386     {
3387       beg += begp - POS_ADDR (beg);
3388       end =  beg + (endp - begp);
3389
3390       if (encodep)
3391         len = encoding_buffer_size (coding, end - beg);
3392       else
3393         len = decoding_buffer_size (coding, end - beg);
3394       buf = get_conversion_buffer (len);
3395
3396       coding->last_block = 1;
3397       produced = (encodep
3398                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3399                                    &consumed)
3400                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3401                                    &consumed));
3402
3403       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3404
3405       TEMP_SET_PT (beg);
3406       insert (buf, produced);
3407       del_range (PT, PT + end - beg);
3408       if (pos >= end)
3409         pos = PT + (pos - end);
3410       else if (pos > beg)
3411         pos = beg;
3412       TEMP_SET_PT (pos);
3413   }
3414
3415   if (!encodep && !NILP (coding->post_read_conversion))
3416     {
3417       /* We must call a post-conversion function which may alter
3418          the text just converted.  */
3419       Lisp_Object insval;
3420
3421       beg = XINT (b);
3422       TEMP_SET_PT (beg);
3423       insval = call1 (coding->post_read_conversion, make_number (len));
3424       CHECK_NUMBER (insval, 0);
3425       len = XINT (insval);
3426     }
3427
3428   return make_number (len);
3429 }
3430
3431 Lisp_Object
3432 code_convert_string (str, coding, encodep, nocopy)
3433      Lisp_Object str, nocopy;
3434      struct coding_system *coding;
3435      int encodep;
3436 {
3437   int len, consumed, produced;
3438   char *buf;
3439   unsigned char *begp, *endp;
3440   int head_skip, tail_skip;
3441   struct gcpro gcpro1;
3442
3443   if (encodep && !NILP (coding->pre_write_conversion)
3444       || !encodep && !NILP (coding->post_read_conversion))
3445     {
3446       /* Since we have to call Lisp functions which assume target text
3447          is in a buffer, after setting a temporary buffer, call
3448          code_convert_region.  */
3449       int count = specpdl_ptr - specpdl;
3450       int len = XSTRING (str)->size;
3451       Lisp_Object result;
3452       struct buffer *old = current_buffer;
3453
3454       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3455       temp_output_buffer_setup (" *code-converting-work*");
3456       set_buffer_internal (XBUFFER (Vstandard_output));
3457       insert_from_string (str, 0, len, 0);
3458       code_convert_region (make_number (BEGV), make_number (ZV),
3459                            coding, encodep);
3460       result = make_buffer_string (BEGV, ZV, 0);
3461       set_buffer_internal (old);
3462       return unbind_to (count, result);
3463     }
3464
3465   /* We may be able to shrink the conversion region.  */
3466   begp = XSTRING (str)->data;
3467   endp = begp + XSTRING (str)->size;
3468   shrink_conversion_area (&begp, &endp, coding, encodep);
3469
3470   if (begp == endp)
3471     /* We need no conversion.  */
3472     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3473
3474   head_skip = begp - XSTRING (str)->data;
3475   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3476
3477   GCPRO1 (str);
3478
3479   if (encodep)
3480     len = encoding_buffer_size (coding, endp - begp);
3481   else
3482     len = decoding_buffer_size (coding, endp - begp);
3483   buf = get_conversion_buffer (len + head_skip + tail_skip);
3484
3485   bcopy (XSTRING (str)->data, buf, head_skip);
3486   coding->last_block = 1;
3487   produced = (encodep
3488               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3489                                buf + head_skip, endp - begp, len, &consumed)
3490               : decode_coding (coding, XSTRING (str)->data + head_skip,
3491                                buf + head_skip, endp - begp, len, &consumed));
3492   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3493          buf + head_skip + produced,
3494          tail_skip);
3495
3496   UNGCPRO;
3497
3498   return make_string (buf, head_skip + produced + tail_skip);
3499 }
3500
3501 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3502        3, 3, "r\nzCoding system: ",
3503   "Decode current region by specified coding system.\n\
3504 When called from a program, takes three arguments:\n\
3505 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3506 Return length of decoded text.")
3507   (b, e, coding_system)
3508      Lisp_Object b, e, coding_system;
3509 {
3510   struct coding_system coding;
3511
3512   CHECK_NUMBER_COERCE_MARKER (b, 0);
3513   CHECK_NUMBER_COERCE_MARKER (e, 1);
3514   CHECK_SYMBOL (coding_system, 2);
3515
3516   if (NILP (coding_system))
3517     return make_number (XFASTINT (e) - XFASTINT (b));
3518   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3519     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3520
3521   return code_convert_region (b, e, &coding, 0);
3522 }
3523
3524 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3525        3, 3, "r\nzCoding system: ",
3526   "Encode current region by specified coding system.\n\
3527 When called from a program, takes three arguments:\n\
3528 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3529 Return length of encoded text.")
3530   (b, e, coding_system)
3531      Lisp_Object b, e, coding_system;
3532 {
3533   struct coding_system coding;
3534
3535   CHECK_NUMBER_COERCE_MARKER (b, 0);
3536   CHECK_NUMBER_COERCE_MARKER (e, 1);
3537   CHECK_SYMBOL (coding_system, 2);
3538
3539   if (NILP (coding_system))
3540     return make_number (XFASTINT (e) - XFASTINT (b));
3541   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3542     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3543
3544   return code_convert_region (b, e, &coding, 1);
3545 }
3546
3547 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3548        2, 3, 0,
3549   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3550 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3551 if the decoding operation is trivial.")
3552   (string, coding_system, nocopy)
3553      Lisp_Object string, coding_system, nocopy;
3554 {
3555   struct coding_system coding;
3556
3557   CHECK_STRING (string, 0);
3558   CHECK_SYMBOL (coding_system, 1);
3559
3560   if (NILP (coding_system))
3561     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3562   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3563     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3564
3565   return code_convert_string (string, &coding, 0, nocopy);
3566 }
3567
3568 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3569        2, 3, 0,
3570   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3571 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3572 if the encoding operation is trivial.")
3573   (string, coding_system, nocopy)
3574      Lisp_Object string, coding_system, nocopy;
3575 {
3576   struct coding_system coding;
3577
3578   CHECK_STRING (string, 0);
3579   CHECK_SYMBOL (coding_system, 1);
3580
3581   if (NILP (coding_system))
3582     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3583   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3584     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3585
3586   return code_convert_string (string, &coding, 1, nocopy);
3587 }
3588
3589 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3590   "Decode a JISX0208 character of shift-jis encoding.\n\
3591 CODE is the character code in SJIS.\n\
3592 Return the corresponding character.")
3593   (code)
3594      Lisp_Object code;
3595 {
3596   unsigned char c1, c2, s1, s2;
3597   Lisp_Object val;
3598
3599   CHECK_NUMBER (code, 0);
3600   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3601   DECODE_SJIS (s1, s2, c1, c2);
3602   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3603   return val;
3604 }
3605
3606 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3607   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3608 Return the corresponding character code in SJIS.")
3609   (ch)
3610      Lisp_Object ch;
3611 {
3612   int charset, c1, c2, s1, s2;
3613   Lisp_Object val;
3614
3615   CHECK_NUMBER (ch, 0);
3616   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3617   if (charset == charset_jisx0208)
3618     {
3619       ENCODE_SJIS (c1, c2, s1, s2);
3620       XSETFASTINT (val, (s1 << 8) | s2);
3621     }
3622   else
3623     XSETFASTINT (val, 0);
3624   return val;
3625 }
3626
3627 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3628   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3629 CODE is the character code in BIG5.\n\
3630 Return the corresponding character.")
3631   (code)
3632      Lisp_Object code;
3633 {
3634   int charset;
3635   unsigned char b1, b2, c1, c2;
3636   Lisp_Object val;
3637
3638   CHECK_NUMBER (code, 0);
3639   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3640   DECODE_BIG5 (b1, b2, charset, c1, c2);
3641   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3642   return val;
3643 }
3644
3645 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3646   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3647 Return the corresponding character code in Big5.")
3648   (ch)
3649      Lisp_Object ch;
3650 {
3651   int charset, c1, c2, b1, b2;
3652   Lisp_Object val;
3653
3654   CHECK_NUMBER (ch, 0);
3655   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3656   if (charset == charset_big5_1 || charset == charset_big5_2)
3657     {
3658       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3659       XSETFASTINT (val, (b1 << 8) | b2);
3660     }
3661   else
3662     XSETFASTINT (val, 0);
3663   return val;
3664 }
3665
3666 DEFUN ("set-terminal-coding-system-internal",
3667        Fset_terminal_coding_system_internal,
3668        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3669   (coding_system)
3670      Lisp_Object coding_system;
3671 {
3672   CHECK_SYMBOL (coding_system, 0);
3673   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3674   /* We had better not send unexpected characters to terminal.  */
3675   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3676
3677   return Qnil;
3678 }
3679
3680 DEFUN ("set-safe-terminal-coding-system-internal",
3681        Fset_safe_terminal_coding_system_internal,
3682        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3683   (coding_system)
3684      Lisp_Object coding_system;
3685 {
3686   CHECK_SYMBOL (coding_system, 0);
3687   setup_coding_system (Fcheck_coding_system (coding_system),
3688                        &safe_terminal_coding);
3689   return Qnil;
3690 }
3691
3692 DEFUN ("terminal-coding-system",
3693        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3694   "Return coding-system of your terminal.")
3695   ()
3696 {
3697   return terminal_coding.symbol;
3698 }
3699
3700 DEFUN ("set-keyboard-coding-system-internal",
3701        Fset_keyboard_coding_system_internal,
3702        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3703   (coding_system)
3704      Lisp_Object coding_system;
3705 {
3706   CHECK_SYMBOL (coding_system, 0);
3707   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3708   return Qnil;
3709 }
3710
3711 DEFUN ("keyboard-coding-system",
3712        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3713   "Return coding-system of what is sent from terminal keyboard.")
3714   ()
3715 {
3716   return keyboard_coding.symbol;
3717 }
3718
3719 \f
3720 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3721        Sfind_operation_coding_system,  1, MANY, 0,
3722   "Choose a coding system for an operation based on the target name.\n\
3723 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3724 DECODING-SYSTEM is the coding system to use for decoding\n\
3725 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3726 for encoding (in case OPERATION does encoding).\n\
3727 \n\
3728 The first argument OPERATION specifies an I/O primitive:\n\
3729   For file I/O, `insert-file-contents' or `write-region'.\n\
3730   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3731   For network I/O, `open-network-stream'.\n\
3732 \n\
3733 The remaining arguments should be the same arguments that were passed\n\
3734 to the primitive.  Depending on which primitive, one of those arguments\n\
3735 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3736 whichever argument specifies the file name is TARGET.\n\
3737 \n\
3738 TARGET has a meaning which depends on OPERATION:\n\
3739   For file I/O, TARGET is a file name.\n\
3740   For process I/O, TARGET is a process name.\n\
3741   For network I/O, TARGET is a service name or a port number\n\
3742 \n\
3743 This function looks up what specified for TARGET in,\n\
3744 `file-coding-system-alist', `process-coding-system-alist',\n\
3745 or `network-coding-system-alist' depending on OPERATION.\n\
3746 They may specify a coding system, a cons of coding systems,\n\
3747 or a function symbol to call.\n\
3748 In the last case, we call the function with one argument,\n\
3749 which is a list of all the arguments given to this function.")
3750   (nargs, args)
3751      int nargs;
3752      Lisp_Object *args;
3753 {
3754   Lisp_Object operation, target_idx, target, val;
3755   register Lisp_Object chain;
3756
3757   if (nargs < 2)
3758     error ("Too few arguments");
3759   operation = args[0];
3760   if (!SYMBOLP (operation)
3761       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3762     error ("Invalid first arguement");
3763   if (nargs < 1 + XINT (target_idx))
3764     error ("Too few arguments for operation: %s",
3765            XSYMBOL (operation)->name->data);
3766   target = args[XINT (target_idx) + 1];
3767   if (!(STRINGP (target)
3768         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3769     error ("Invalid %dth argument", XINT (target_idx) + 1);
3770
3771   chain = ((EQ (operation, Qinsert_file_contents)
3772             || EQ (operation, Qwrite_region))
3773            ? Vfile_coding_system_alist
3774            : (EQ (operation, Qopen_network_stream)
3775               ? Vnetwork_coding_system_alist
3776               : Vprocess_coding_system_alist));
3777   if (NILP (chain))
3778     return Qnil;
3779
3780   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3781     {
3782       Lisp_Object elt;
3783       elt = XCONS (chain)->car;
3784
3785       if (CONSP (elt)
3786           && ((STRINGP (target)
3787                && STRINGP (XCONS (elt)->car)
3788                && fast_string_match (XCONS (elt)->car, target) >= 0)
3789               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3790         {
3791           val = XCONS (elt)->cdr;
3792           /* Here, if VAL is both a valid coding system and a valid
3793              function symbol, we return VAL as a coding system.  */
3794           if (CONSP (val))
3795             return val;
3796           if (! SYMBOLP (val))
3797             return Qnil;
3798           if (! NILP (Fcoding_system_p (val)))
3799             return Fcons (val, val);
3800           if (! NILP (Ffboundp (val)))
3801             {
3802               val = call1 (val, Flist (nargs, args));
3803               if (CONSP (val))
3804                 return val;
3805               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
3806                 return Fcons (val, val);
3807             }
3808           return Qnil;
3809         }
3810     }
3811   return Qnil;
3812 }
3813
3814 #endif /* emacs */
3815
3816 \f
3817 /*** 8. Post-amble ***/
3818
3819 init_coding_once ()
3820 {
3821   int i;
3822
3823   /* Emacs' internal format specific initialize routine.  */
3824   for (i = 0; i <= 0x20; i++)
3825     emacs_code_class[i] = EMACS_control_code;
3826   emacs_code_class[0x0A] = EMACS_linefeed_code;
3827   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3828   for (i = 0x21 ; i < 0x7F; i++)
3829     emacs_code_class[i] = EMACS_ascii_code;
3830   emacs_code_class[0x7F] = EMACS_control_code;
3831   emacs_code_class[0x80] = EMACS_leading_code_composition;
3832   for (i = 0x81; i < 0xFF; i++)
3833     emacs_code_class[i] = EMACS_invalid_code;
3834   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3835   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3836   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3837   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3838
3839   /* ISO2022 specific initialize routine.  */
3840   for (i = 0; i < 0x20; i++)
3841     iso_code_class[i] = ISO_control_code;
3842   for (i = 0x21; i < 0x7F; i++)
3843     iso_code_class[i] = ISO_graphic_plane_0;
3844   for (i = 0x80; i < 0xA0; i++)
3845     iso_code_class[i] = ISO_control_code;
3846   for (i = 0xA1; i < 0xFF; i++)
3847     iso_code_class[i] = ISO_graphic_plane_1;
3848   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3849   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3850   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3851   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3852   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3853   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3854   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3855   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3856   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3857   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3858
3859   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3860   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3861
3862   setup_coding_system (Qnil, &keyboard_coding);
3863   setup_coding_system (Qnil, &terminal_coding);
3864   setup_coding_system (Qnil, &safe_terminal_coding);
3865
3866 #if defined (MSDOS) || defined (WINDOWSNT)
3867   system_eol_type = CODING_EOL_CRLF;
3868 #else
3869   system_eol_type = CODING_EOL_LF;
3870 #endif
3871 }
3872
3873 #ifdef emacs
3874
3875 syms_of_coding ()
3876 {
3877   Qtarget_idx = intern ("target-idx");
3878   staticpro (&Qtarget_idx);
3879
3880   Qcoding_system_history = intern ("coding-system-history");
3881   staticpro (&Qcoding_system_history);
3882   Fset (Qcoding_system_history, Qnil);
3883
3884   /* Target FILENAME is the first argument.  */
3885   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3886   /* Target FILENAME is the third argument.  */
3887   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3888
3889   Qcall_process = intern ("call-process");
3890   staticpro (&Qcall_process);
3891   /* Target PROGRAM is the first argument.  */
3892   Fput (Qcall_process, Qtarget_idx, make_number (0));
3893
3894   Qcall_process_region = intern ("call-process-region");
3895   staticpro (&Qcall_process_region);
3896   /* Target PROGRAM is the third argument.  */
3897   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3898
3899   Qstart_process = intern ("start-process");
3900   staticpro (&Qstart_process);
3901   /* Target PROGRAM is the third argument.  */
3902   Fput (Qstart_process, Qtarget_idx, make_number (2));
3903
3904   Qopen_network_stream = intern ("open-network-stream");
3905   staticpro (&Qopen_network_stream);
3906   /* Target SERVICE is the fourth argument.  */
3907   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3908
3909   Qcoding_system = intern ("coding-system");
3910   staticpro (&Qcoding_system);
3911
3912   Qeol_type = intern ("eol-type");
3913   staticpro (&Qeol_type);
3914
3915   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3916   staticpro (&Qbuffer_file_coding_system);
3917
3918   Qpost_read_conversion = intern ("post-read-conversion");
3919   staticpro (&Qpost_read_conversion);
3920
3921   Qpre_write_conversion = intern ("pre-write-conversion");
3922   staticpro (&Qpre_write_conversion);
3923
3924   Qno_conversion = intern ("no-conversion");
3925   staticpro (&Qno_conversion);
3926
3927   Qundecided = intern ("undecided");
3928   staticpro (&Qundecided);
3929
3930   Qcoding_system_spec = intern ("coding-system-spec");
3931   staticpro (&Qcoding_system_spec);
3932
3933   Qcoding_system_p = intern ("coding-system-p");
3934   staticpro (&Qcoding_system_p);
3935
3936   Qcoding_system_error = intern ("coding-system-error");
3937   staticpro (&Qcoding_system_error);
3938
3939   Fput (Qcoding_system_error, Qerror_conditions,
3940         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3941   Fput (Qcoding_system_error, Qerror_message,
3942         build_string ("Invalid coding system"));
3943
3944   Qcoding_category_index = intern ("coding-category-index");
3945   staticpro (&Qcoding_category_index);
3946
3947   {
3948     int i;
3949     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3950       {
3951         coding_category_table[i] = intern (coding_category_name[i]);
3952         staticpro (&coding_category_table[i]);
3953         Fput (coding_category_table[i], Qcoding_category_index,
3954               make_number (i));
3955       }
3956   }
3957
3958   Qcharacter_unification_table = intern ("character-unification-table");
3959   staticpro (&Qcharacter_unification_table);
3960   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3961         make_number (0));
3962
3963   Qcharacter_unification_table_for_decode
3964     = intern ("character-unification-table-for-decode");
3965   staticpro (&Qcharacter_unification_table_for_decode);
3966
3967   Qcharacter_unification_table_for_encode
3968     = intern ("character-unification-table-for-encode");
3969   staticpro (&Qcharacter_unification_table_for_encode);
3970
3971   Qemacs_mule = intern ("emacs-mule");
3972   staticpro (&Qemacs_mule);
3973
3974   defsubr (&Scoding_system_spec);
3975   defsubr (&Scoding_system_p);
3976   defsubr (&Sread_coding_system);
3977   defsubr (&Sread_non_nil_coding_system);
3978   defsubr (&Scheck_coding_system);
3979   defsubr (&Sdetect_coding_region);
3980   defsubr (&Sdecode_coding_region);
3981   defsubr (&Sencode_coding_region);
3982   defsubr (&Sdecode_coding_string);
3983   defsubr (&Sencode_coding_string);
3984   defsubr (&Sdecode_sjis_char);
3985   defsubr (&Sencode_sjis_char);
3986   defsubr (&Sdecode_big5_char);
3987   defsubr (&Sencode_big5_char);
3988   defsubr (&Sset_terminal_coding_system_internal);
3989   defsubr (&Sset_safe_terminal_coding_system_internal);
3990   defsubr (&Sterminal_coding_system);
3991   defsubr (&Sset_keyboard_coding_system_internal);
3992   defsubr (&Skeyboard_coding_system);
3993   defsubr (&Sfind_operation_coding_system);
3994
3995   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3996     "List of coding-categories (symbols) ordered by priority.");
3997   {
3998     int i;
3999
4000     Vcoding_category_list = Qnil;
4001     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
4002       Vcoding_category_list
4003         = Fcons (coding_category_table[i], Vcoding_category_list);
4004   }
4005
4006   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
4007     "Specify the coding system for read operations.\n\
4008 It is useful to bind this variable with `let', but do not set it globally.\n\
4009 If the value is a coding system, it is used for decoding on read operation.\n\
4010 If not, an appropriate element is used from one of the coding system alists:\n\
4011 There are three such tables, `file-coding-system-alist',\n\
4012 `process-coding-system-alist', and `network-coding-system-alist'.");
4013   Vcoding_system_for_read = Qnil;
4014
4015   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
4016     "Specify the coding system for write operations.\n\
4017 It is useful to bind this variable with `let', but do not set it globally.\n\
4018 If the value is a coding system, it is used for encoding on write operation.\n\
4019 If not, an appropriate element is used from one of the coding system alists:\n\
4020 There are three such tables, `file-coding-system-alist',\n\
4021 `process-coding-system-alist', and `network-coding-system-alist'.");
4022   Vcoding_system_for_write = Qnil;
4023
4024   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4025     "Coding system used in the latest file or process I/O.");
4026   Vlast_coding_system_used = Qnil;
4027
4028   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4029     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4030   inhibit_eol_conversion = 0;
4031
4032   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4033     "Alist to decide a coding system to use for a file I/O operation.\n\
4034 The format is ((PATTERN . VAL) ...),\n\
4035 where PATTERN is a regular expression matching a file name,\n\
4036 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4037 If VAL is a coding system, it is used for both decoding and encoding\n\
4038 the file contents.\n\
4039 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4040 and the cdr part is used for encoding.\n\
4041 If VAL is a function symbol, the function must return a coding system\n\
4042 or a cons of coding systems which are used as above.\n\
4043 \n\
4044 See also the function `find-operation-coding-system'.");
4045   Vfile_coding_system_alist = Qnil;
4046
4047   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4048     "Alist to decide a coding system to use for a process I/O operation.\n\
4049 The format is ((PATTERN . VAL) ...),\n\
4050 where PATTERN is a regular expression matching a program name,\n\
4051 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4052 If VAL is a coding system, it is used for both decoding what received\n\
4053 from the program and encoding what sent to the program.\n\
4054 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4055 and the cdr part is used for encoding.\n\
4056 If VAL is a function symbol, the function must return a coding system\n\
4057 or a cons of coding systems which are used as above.\n\
4058 \n\
4059 See also the function `find-operation-coding-system'.");
4060   Vprocess_coding_system_alist = Qnil;
4061
4062   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4063     "Alist to decide a coding system to use for a network I/O operation.\n\
4064 The format is ((PATTERN . VAL) ...),\n\
4065 where PATTERN is a regular expression matching a network service name\n\
4066 or is a port number to connect to,\n\
4067 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4068 If VAL is a coding system, it is used for both decoding what received\n\
4069 from the network stream and encoding what sent to the network stream.\n\
4070 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4071 and the cdr part is used for encoding.\n\
4072 If VAL is a function symbol, the function must return a coding system\n\
4073 or a cons of coding systems which are used as above.\n\
4074 \n\
4075 See also the function `find-operation-coding-system'.");
4076   Vnetwork_coding_system_alist = Qnil;
4077
4078   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4079     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4080   eol_mnemonic_unix = ':';
4081
4082   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4083     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4084   eol_mnemonic_dos = '\\';
4085
4086   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4087     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4088   eol_mnemonic_mac = '/';
4089
4090   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4091     "Mnemonic character indicating end-of-line format is not yet decided.");
4092   eol_mnemonic_undecided = ':';
4093
4094   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4095     "Non-nil means ISO 2022 encoder/decoder do character unification.");
4096   Venable_character_unification = Qt;
4097
4098   DEFVAR_LISP ("standard-character-unification-table-for-decode",
4099     &Vstandard_character_unification_table_for_decode,
4100     "Table for unifying characters when reading.");
4101   Vstandard_character_unification_table_for_decode = Qnil;
4102
4103   DEFVAR_LISP ("standard-character-unification-table-for-encode",
4104     &Vstandard_character_unification_table_for_encode,
4105     "Table for unifying characters when writing.");
4106   Vstandard_character_unification_table_for_encode = Qnil;
4107
4108   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4109     "Alist of charsets vs revision numbers.\n\
4110 While encoding, if a charset (car part of an element) is found,\n\
4111 designate it with the escape sequence identifing revision (cdr part of the element).");
4112   Vcharset_revision_alist = Qnil;
4113
4114   DEFVAR_LISP ("default-process-coding-system",
4115                &Vdefault_process_coding_system,
4116     "Cons of coding systems used for process I/O by default.\n\
4117 The car part is used for decoding a process output,\n\
4118 the cdr part is used for encoding a text to be sent to a process.");
4119   Vdefault_process_coding_system = Qnil;
4120
4121   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4122     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4123 This is a vector of length 256.\n\
4124 If Nth element is non-nil, the existence of code N in a file\n\
4125 \(or output of subprocess) doesn't prevent it to be detected as\n\
4126 a coding system of ISO 2022 variant which has a flag\n\
4127 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4128 or reading output of a subprocess.\n\
4129 Only 128th through 159th elements has a meaning.");
4130   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4131 }
4132
4133 #endif /* emacs */