src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Raw text
  71
  72   A coding system for a text containing random 8-bit code.  Emacs
  73   does no code conversion on such a text except for end-of-line
  74   format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding-system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding-system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How end-of-line of a text is encoded depends on a system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  96
  97   Since text characters encoding and end-of-line encoding are
  98   independent, any coding system described above can take
  99   any format of end-of-line.  So, Emacs has information of format of
 100   end-of-line in each coding-system.  See section 6 for more details.
 101
 102 */
 103
 104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 105
 106   These functions check if a text between SRC and SRC_END is encoded
 107   in the coding system category XXX.  Each returns an integer value in
 108   which appropriate flag bits for the category XXX are set.  The flag
 109   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 110   template of these functions.  */
 111 #if 0
 112 int
 113 detect_coding_emacs_mule (src, src_end)
 114      unsigned char *src, *src_end;
 115 {
 116   ...
 117 }
 118 #endif
 119
 120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 121
 122   These functions decode SRC_BYTES length text at SOURCE encoded in
 123   CODING to Emacs' internal format (emacs-mule).  The resulting text
 124   goes to a place pointed to by DESTINATION, the length of which should
 125   not exceed DST_BYTES.  The number of bytes actually processed is
 126   returned as *CONSUMED.  The return value is the length of the decoded
 127   text.  Below is a template of these functions.  */
 128 #if 0
 129 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 130      struct coding_system *coding;
 131      unsigned char *source, *destination;
 132      int src_bytes, dst_bytes;
 133      int *consumed;
 134 {
 135   ...
 136 }
 137 #endif
 138
 139 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 140
 141   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 142   internal format (emacs-mule) to CODING.  The resulting text goes to
 143   a place pointed to by DESTINATION, the length of which should not
 144   exceed DST_BYTES.  The number of bytes actually processed is
 145   returned as *CONSUMED.  The return value is the length of the
 146   encoded text.  Below is a template of these functions.  */
 147 #if 0
 148 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152      int *consumed;
 153 {
 154   ...
 155 }
 156 #endif
 157
 158 /*** COMMONLY USED MACROS ***/
 159
 160 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 161    THREE_MORE_BYTES safely get one, two, and three bytes from the
 162    source text respectively.  If there are not enough bytes in the
 163    source, they jump to `label_end_of_loop'.  The caller should set
 164    variables `src' and `src_end' to appropriate areas in advance.  */
 165
 166 #define ONE_MORE_BYTE(c1)       \
 167   do {                          \
 168     if (src < src_end)          \
 169       c1 = *src++;              \
 170     else                        \
 171       goto label_end_of_loop;   \
 172   } while (0)
 173
 174 #define TWO_MORE_BYTES(c1, c2)  \
 175   do {                          \
 176     if (src + 1 < src_end)      \
 177       c1 = *src++, c2 = *src++; \
 178     else                        \
 179       goto label_end_of_loop;   \
 180   } while (0)
 181
 182 #define THREE_MORE_BYTES(c1, c2, c3)            \
 183   do {                                          \
 184     if (src + 2 < src_end)                      \
 185       c1 = *src++, c2 = *src++, c3 = *src++;    \
 186     else                                        \
 187       goto label_end_of_loop;                   \
 188   } while (0)
 189
 190 /* The following three macros DECODE_CHARACTER_ASCII,
 191    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 192    the multi-byte form of a character of each class at the place
 193    pointed by `dst'.  The caller should set the variable `dst' to
 194    point to an appropriate area and the variable `coding' to point to
 195    the coding-system of the currently decoding text in advance.  */
 196
 197 /* Decode one ASCII character C.  */
 198
 199 #define DECODE_CHARACTER_ASCII(c)                               \
 200   do {                                                          \
 201     if (COMPOSING_P (coding->composing))                        \
 202       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 203     else                                                        \
 204       *dst++ = (c);                                             \
 205   } while (0)
 206
 207 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 208    position-code is C.  */
 209
 210 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 211   do {                                                                  \
 212     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 213     if (COMPOSING_P (coding->composing))                                \
 214       *dst++ = leading_code + 0x20;                                     \
 215     else                                                                \
 216       *dst++ = leading_code;                                            \
 217     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 218       *dst++ = leading_code;                                            \
 219     *dst++ = (c) | 0x80;                                                \
 220   } while (0)
 221
 222 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 223    position-codes are C1 and C2.  */
 224
 225 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 226   do {                                                  \
 227     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 228     *dst++ = (c2) | 0x80;                               \
 229   } while (0)
 230
 231 \f
 232 /*** 1. Preamble ***/
 233
 234 #include <stdio.h>
 235
 236 #ifdef emacs
 237
 238 #include <config.h>
 239 #include "lisp.h"
 240 #include "buffer.h"
 241 #include "charset.h"
 242 #include "ccl.h"
 243 #include "coding.h"
 244 #include "window.h"
 245
 246 #else  /* not emacs */
 247
 248 #include "mulelib.h"
 249
 250 #endif /* not emacs */
 251
 252 Lisp_Object Qcoding_system, Qeol_type;
 253 Lisp_Object Qbuffer_file_coding_system;
 254 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 255 Lisp_Object Qno_conversion, Qundecided;
 256 Lisp_Object Qcoding_system_history;
 257
 258 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 259 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 260 Lisp_Object Qstart_process, Qopen_network_stream;
 261 Lisp_Object Qtarget_idx;
 262
 263 /* Mnemonic character of each format of end-of-line.  */
 264 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 265 /* Mnemonic character to indicate format of end-of-line is not yet
 266    decided.  */
 267 int eol_mnemonic_undecided;
 268
 269 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 270    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 271 int system_eol_type;
 272
 273 #ifdef emacs
 274
 275 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 276
 277 /* Coding system emacs-mule is for converting only end-of-line format.  */
 278 Lisp_Object Qemacs_mule;
 279
 280 /* Coding-systems are handed between Emacs Lisp programs and C internal
 281    routines by the following three variables.  */
 282 /* Coding-system for reading files and receiving data from process.  */
 283 Lisp_Object Vcoding_system_for_read;
 284 /* Coding-system for writing files and sending data to process.  */
 285 Lisp_Object Vcoding_system_for_write;
 286 /* Coding-system actually used in the latest I/O.  */
 287 Lisp_Object Vlast_coding_system_used;
 288
 289 /* A vector of length 256 which contains information about special
 290    Latin codes (espepcially for dealing with Microsoft code).  */
 291 Lisp_Object Vlatin_extra_code_table;
 292
 293 /* Flag to inhibit code conversion of end-of-line format.  */
 294 int inhibit_eol_conversion;
 295
 296 /* Coding system to be used to encode text for terminal display.  */
 297 struct coding_system terminal_coding;
 298
 299 /* Coding system to be used to encode text for terminal display when
 300    terminal coding system is nil.  */
 301 struct coding_system safe_terminal_coding;
 302
 303 /* Coding system of what is sent from terminal keyboard.  */
 304 struct coding_system keyboard_coding;
 305
 306 Lisp_Object Vfile_coding_system_alist;
 307 Lisp_Object Vprocess_coding_system_alist;
 308 Lisp_Object Vnetwork_coding_system_alist;
 309
 310 #endif /* emacs */
 311
 312 Lisp_Object Qcoding_category_index;
 313
 314 /* List of symbols `coding-category-xxx' ordered by priority.  */
 315 Lisp_Object Vcoding_category_list;
 316
 317 /* Table of coding-systems currently assigned to each coding-category.  */
 318 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 319
 320 /* Table of names of symbol for each coding-category.  */
 321 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 322   "coding-category-emacs-mule",
 323   "coding-category-sjis",
 324   "coding-category-iso-7",
 325   "coding-category-iso-8-1",
 326   "coding-category-iso-8-2",
 327   "coding-category-iso-7-else",
 328   "coding-category-iso-8-else",
 329   "coding-category-big5",
 330   "coding-category-raw-text",
 331   "coding-category-binary"
 332 };
 333
 334 /* Flag to tell if we look up unification table on character code
 335    conversion.  */
 336 Lisp_Object Venable_character_unification;
 337 /* Standard unification table to look up on decoding (reading).  */
 338 Lisp_Object Vstandard_character_unification_table_for_decode;
 339 /* Standard unification table to look up on encoding (writing).  */
 340 Lisp_Object Vstandard_character_unification_table_for_encode;
 341
 342 Lisp_Object Qcharacter_unification_table;
 343 Lisp_Object Qcharacter_unification_table_for_decode;
 344 Lisp_Object Qcharacter_unification_table_for_encode;
 345
 346 /* Alist of charsets vs revision number.  */
 347 Lisp_Object Vcharset_revision_alist;
 348
 349 /* Default coding systems used for process I/O.  */
 350 Lisp_Object Vdefault_process_coding_system;
 351
 352 \f
 353 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 354
 355 /* Emacs' internal format for encoding multiple character sets is a
 356    kind of multi-byte encoding, i.e. characters are encoded by
 357    variable-length sequences of one-byte codes.  ASCII characters
 358    and control characters (e.g. `tab', `newline') are represented by
 359    one-byte sequences which are their ASCII codes, in the range 0x00
 360    through 0x7F.  The other characters are represented by a sequence
 361    of `base leading-code', optional `extended leading-code', and one
 362    or two `position-code's.  The length of the sequence is determined
 363    by the base leading-code.  Leading-code takes the range 0x80
 364    through 0x9F, whereas extended leading-code and position-code take
 365    the range 0xA0 through 0xFF.  See `charset.h' for more details
 366    about leading-code and position-code.
 367
 368    There's one exception to this rule.  Special leading-code
 369    `leading-code-composition' denotes that the following several
 370    characters should be composed into one character.  Leading-codes of
 371    components (except for ASCII) are added 0x20.  An ASCII character
 372    component is represented by a 2-byte sequence of `0xA0' and
 373    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 374    details of composite character.  Hence, we can summarize the code
 375    range as follows:
 376
 377    --- CODE RANGE of Emacs' internal format ---
 378    (character set)      (range)
 379    ASCII                0x00 .. 0x7F
 380    ELSE (1st byte)      0x80 .. 0x9F
 381         (rest bytes)    0xA0 .. 0xFF
 382    ---------------------------------------------
 383
 384   */
 385
 386 enum emacs_code_class_type emacs_code_class[256];
 387
 388 /* Go to the next statement only if *SRC is accessible and the code is
 389    greater than 0xA0.  */
 390 #define CHECK_CODE_RANGE_A0_FF  \
 391   do {                          \
 392     if (src >= src_end)         \
 393       goto label_end_of_switch; \
 394     else if (*src++ < 0xA0)     \
 395       return 0;                 \
 396   } while (0)
 397
 398 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 399    Check if a text is encoded in Emacs' internal format.  If it is,
 400    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 401
 402 int
 403 detect_coding_emacs_mule (src, src_end)
 404      unsigned char *src, *src_end;
 405 {
 406   unsigned char c;
 407   int composing = 0;
 408
 409   while (src < src_end)
 410     {
 411       c = *src++;
 412
 413       if (composing)
 414         {
 415           if (c < 0xA0)
 416             composing = 0;
 417           else
 418             c -= 0x20;
 419         }
 420
 421       switch (emacs_code_class[c])
 422         {
 423         case EMACS_ascii_code:
 424         case EMACS_linefeed_code:
 425           break;
 426
 427         case EMACS_control_code:
 428           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 429             return 0;
 430           break;
 431
 432         case EMACS_invalid_code:
 433           return 0;
 434
 435         case EMACS_leading_code_composition: /* c == 0x80 */
 436           if (composing)
 437             CHECK_CODE_RANGE_A0_FF;
 438           else
 439             composing = 1;
 440           break;
 441
 442         case EMACS_leading_code_4:
 443           CHECK_CODE_RANGE_A0_FF;
 444           /* fall down to check it two more times ...  */
 445
 446         case EMACS_leading_code_3:
 447           CHECK_CODE_RANGE_A0_FF;
 448           /* fall down to check it one more time ...  */
 449
 450         case EMACS_leading_code_2:
 451           CHECK_CODE_RANGE_A0_FF;
 452           break;
 453
 454         default:
 455         label_end_of_switch:
 456           break;
 457         }
 458     }
 459   return CODING_CATEGORY_MASK_EMACS_MULE;
 460 }
 461
 462 \f
 463 /*** 3. ISO2022 handlers ***/
 464
 465 /* The following note describes the coding system ISO2022 briefly.
 466    Since the intention of this note is to help in understanding of
 467    the programs in this file, some parts are NOT ACCURATE or OVERLY
 468    SIMPLIFIED.  For the thorough understanding, please refer to the
 469    original document of ISO2022.
 470
 471    ISO2022 provides many mechanisms to encode several character sets
 472    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 473    all text is encoded by codes of less than 128.  This may make the
 474    encoded text a little bit longer, but the text gets more stability
 475    to pass through several gateways (some of them strip off the MSB).
 476
 477    There are two kinds of character set: control character set and
 478    graphic character set.  The former contains control characters such
 479    as `newline' and `escape' to provide control functions (control
 480    functions are provided also by escape sequences).  The latter
 481    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 482    two control character sets and many graphic character sets.
 483
 484    Graphic character sets are classified into one of the following
 485    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 486    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 487    bytes (DIMENSION) and the number of characters in one dimension
 488    (CHARS) of the set.  In addition, each character set is assigned an
 489    identification tag (called "final character" and denoted as <F>
 490    here after) which is unique in each class.  <F> of each character
 491    set is decided by ECMA(*) when it is registered in ISO.  Code range
 492    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 493
 494    Note (*): ECMA = European Computer Manufacturers Association
 495
 496    Here are examples of graphic character set [NAME(<F>)]:
 497         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 498         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 499         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 500         o DIMENSION2_CHARS96 -- none for the moment
 501
 502    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 503         C0 [0x00..0x1F] -- control character plane 0
 504         GL [0x20..0x7F] -- graphic character plane 0
 505         C1 [0x80..0x9F] -- control character plane 1
 506         GR [0xA0..0xFF] -- graphic character plane 1
 507
 508    A control character set is directly designated and invoked to C0 or
 509    C1 by an escape sequence.  The most common case is that ISO646's
 510    control character set is designated/invoked to C0 and ISO6429's
 511    control character set is designated/invoked to C1, and usually
 512    these designations/invocations are omitted in a coded text.  With
 513    7-bit environment, only C0 can be used, and a control character for
 514    C1 is encoded by an appropriate escape sequence to fit in the
 515    environment.  Every control character for C1 has a
 516    corresponding escape sequence defined.
 517
 518    A graphic character set is at first designated to one of four
 519    graphic registers (G0 through G3), then these graphic registers are
 520    invoked to GL or GR.  These designations and invocations can be
 521    done independently.  The most common case is that G0 is invoked to
 522    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 523    these invocations and designations are omitted in a coded text.
 524    With 7-bit environment, only GL can be used.
 525
 526    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 527    and 0x7F of GL area work as control characters SPACE and DEL
 528    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 529
 530    There are two ways of invocation: locking-shift and single-shift.
 531    With locking-shift, the invocation lasts until the next different
 532    invocation, whereas with single-shift, the invocation works only
 533    for the following character and doesn't affect locking-shift.
 534    Invocations are done by the following control characters or escape
 535    sequences.
 536
 537    ----------------------------------------------------------------------
 538    function             control char    escape sequence description
 539    ----------------------------------------------------------------------
 540    SI  (shift-in)               0x0F    none            invoke G0 to GL
 541    SO  (shift-out)              0x0E    none            invoke G1 to GL
 542    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 543    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 544    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 545    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 546    ----------------------------------------------------------------------
 547    The first four are for locking-shift.  Control characters for these
 548    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 549
 550    Designations are done by the following escape sequences.
 551    ----------------------------------------------------------------------
 552    escape sequence      description
 553    ----------------------------------------------------------------------
 554    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 555    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 556    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 557    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 558    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 559    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 560    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 561    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 562    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 563    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 564    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 565    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 566    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 567    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 568    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 569    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 570    ----------------------------------------------------------------------
 571
 572    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 573    of dimension 1, chars 94, and final character <F>, and so on.
 574
 575    Note (*): Although these designations are not allowed in ISO2022,
 576    Emacs accepts them on decoding, and produces them on encoding
 577    CHARS96 character set in a coding system which is characterized as
 578    7-bit environment, non-locking-shift, and non-single-shift.
 579
 580    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 581    '(' can be omitted.  We call this "short-form" hereafter.
 582
 583    Now you may notice that there are a lot of ways for encoding the
 584    same multilingual text in ISO2022.  Actually, there exist many
 585    coding systems such as Compound Text (used in X's inter client
 586    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 587    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 588    localized platforms), and all of these are variants of ISO2022.
 589
 590    In addition to the above, Emacs handles two more kinds of escape
 591    sequences: ISO6429's direction specification and Emacs' private
 592    sequence for specifying character composition.
 593
 594    ISO6429's direction specification takes the following format:
 595         o CSI ']'      -- end of the current direction
 596         o CSI '0' ']'  -- end of the current direction
 597         o CSI '1' ']'  -- start of left-to-right text
 598         o CSI '2' ']'  -- start of right-to-left text
 599    The control character CSI (0x9B: control sequence introducer) is
 600    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 601
 602    Character composition specification takes the following format:
 603         o ESC '0' -- start character composition
 604         o ESC '1' -- end character composition
 605    Since these are not standard escape sequences of any ISO, the use
 606    of them for these meaning is restricted to Emacs only.  */
 607
 608 enum iso_code_class_type iso_code_class[256];
 609
 610 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 611    Check if a text is encoded in ISO2022.  If it is, returns an
 612    integer in which appropriate flag bits any of:
 613         CODING_CATEGORY_MASK_ISO_7
 614         CODING_CATEGORY_MASK_ISO_8_1
 615         CODING_CATEGORY_MASK_ISO_8_2
 616         CODING_CATEGORY_MASK_ISO_7_ELSE
 617         CODING_CATEGORY_MASK_ISO_8_ELSE
 618    are set.  If a code which should never appear in ISO2022 is found,
 619    returns 0.  */
 620
 621 int
 622 detect_coding_iso2022 (src, src_end)
 623      unsigned char *src, *src_end;
 624 {
 625   int mask = (CODING_CATEGORY_MASK_ISO_7
 626               | CODING_CATEGORY_MASK_ISO_8_1
 627               | CODING_CATEGORY_MASK_ISO_8_2
 628               | CODING_CATEGORY_MASK_ISO_7_ELSE
 629               | CODING_CATEGORY_MASK_ISO_8_ELSE
 630               );
 631   int g1 = 0;                   /* 1 iff designating to G1.  */
 632   int c, i;
 633   struct coding_system coding_iso_8_1, coding_iso_8_2;
 634
 635   /* Coding systems of these categories may accept latin extra codes.  */
 636   setup_coding_system
 637     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
 638      &coding_iso_8_1);
 639   setup_coding_system
 640     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
 641      &coding_iso_8_2);
 642
 643   while (mask && src < src_end)
 644     {
 645       c = *src++;
 646       switch (c)
 647         {
 648         case ISO_CODE_ESC:
 649           if (src >= src_end)
 650             break;
 651           c = *src++;
 652           if ((c >= '(' && c <= '/'))
 653             {
 654               /* Designation sequence for a charset of dimension 1.  */
 655               if (src >= src_end)
 656                 break;
 657               c = *src++;
 658               if (c < ' ' || c >= 0x80)
 659                 /* Invalid designation sequence.  */
 660                 return 0;
 661             }
 662           else if (c == '$')
 663             {
 664               /* Designation sequence for a charset of dimension 2.  */
 665               if (src >= src_end)
 666                 break;
 667               c = *src++;
 668               if (c >= '@' && c <= 'B')
 669                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 670                 ;
 671               else if (c >= '(' && c <= '/')
 672                 {
 673                   if (src >= src_end)
 674                     break;
 675                   c = *src++;
 676                   if (c < ' ' || c >= 0x80)
 677                     /* Invalid designation sequence.  */
 678                     return 0;
 679                 }
 680               else
 681                 /* Invalid designation sequence.  */
 682                 return 0;
 683             }
 684           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 685             /* Locking shift.  */
 686             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 687                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 688           else if (c == '0' || c == '1' || c == '2')
 689             /* Start/end composition.  */
 690             ;
 691           else
 692             /* Invalid escape sequence.  */
 693             return 0;
 694           break;
 695
 696         case ISO_CODE_SO:
 697           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 698                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 699           break;
 700
 701         case ISO_CODE_CSI:
 702         case ISO_CODE_SS2:
 703         case ISO_CODE_SS3:
 704           {
 705             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 706
 707             if (VECTORP (Vlatin_extra_code_table)
 708                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 709               {
 710                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 711                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 712                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 713                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 714               }
 715             mask &= newmask;
 716           }
 717           break;
 718
 719         default:
 720           if (c < 0x80)
 721             break;
 722           else if (c < 0xA0)
 723             {
 724               if (VECTORP (Vlatin_extra_code_table)
 725                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 726                 {
 727                   int newmask = 0;
 728
 729                   if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 730                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 731                   if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 732                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 733                   mask &= newmask;
 734                 }
 735               else
 736                 return 0;
 737             }
 738           else
 739             {
 740               unsigned char *src_begin = src;
 741
 742               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 743                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 744               while (src < src_end && *src >= 0xA0)
 745                 src++;
 746               if ((src - src_begin - 1) & 1 && src < src_end)
 747                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 748             }
 749           break;
 750         }
 751     }
 752
 753   return mask;
 754 }
 755
 756 /* Decode a character whose charset is CHARSET and whose 1st position
 757    code is C1.  If the dimension of CHARSET is 2, the 2nd position code
 758    is fetched by ONE_MORE_BYTE and set to C2.  If CHARSET is negative,
 759    we are decoding ill formed text, and what we can do is just to read
 760    C1 as is.  The expansion uses the variables `coding', `dst', `c2'
 761    and `unification_table' of the place where the macro is used.  */
 762 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 763   do {                                                                  \
 764     int c_alt, charset_alt = (charset);                                 \
 765     if (COMPOSING_HEAD_P (coding->composing))                           \
 766       {                                                                 \
 767         *dst++ = LEADING_CODE_COMPOSITION;                              \
 768         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 769           /* To tell composition rules are embedded.  */                \
 770           *dst++ = 0xFF;                                                \
 771         coding->composing += 2;                                         \
 772       }                                                                 \
 773     if ((charset) >= 0)                                                 \
 774       {                                                                 \
 775         if (CHARSET_DIMENSION (charset) == 2)                           \
 776           ONE_MORE_BYTE (c2);                                           \
 777         if (!NILP (unification_table)                                   \
 778             && ((c_alt = unify_char (unification_table,                 \
 779                                      -1, (charset), c1, c2)) >= 0))     \
 780           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 781       }                                                                 \
 782     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 783       DECODE_CHARACTER_ASCII (c1);                                      \
 784     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 785       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 786     else                                                                \
 787       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 788     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 789       /* To tell a composition rule follows.  */                        \
 790       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 791   } while (0)
 792
 793 /* Designate the charset of DIMENSION, CHARS and FINAL_CHAR to REG.  */
 794 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 795   do {                                                                  \
 796     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 797                                      make_number (chars),               \
 798                                      make_number (final_char));         \
 799     if (charset >= 0)           /* else leave REG untouched */          \
 800       {                                                                 \
 801         if (coding->direction == 1      /* right-to-left */             \
 802             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 803           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 804         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 805       }                                                                 \
 806   } while (0)
 807
 808 /* Decode SRC_BYTES bytes of ISO2022 text at SOURCE into DESTINATION.  See
 809    the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 810 int
 811 decode_coding_iso2022 (coding, source, destination,
 812                        src_bytes, dst_bytes, consumed)
 813      struct coding_system *coding;
 814      unsigned char *source, *destination;
 815      int src_bytes, dst_bytes;
 816      int *consumed;
 817 {
 818   unsigned char *src = source;
 819   unsigned char *src_end = source + src_bytes;
 820   unsigned char *dst = destination;
 821   unsigned char *dst_end = destination + dst_bytes;
 822   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 823      from DST_END to assure that overflow checking is necessary only
 824      at the head of loop.  */
 825   unsigned char *adjusted_dst_end = dst_end - 6;
 826   int charset;
 827   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 828   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 829   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 830   Lisp_Object unification_table
 831       = coding->character_unification_table_for_decode;
 832
 833   if (!NILP (Venable_character_unification) && NILP (unification_table))
 834     unification_table = Vstandard_character_unification_table_for_decode;
 835
 836   while (src < src_end && dst < adjusted_dst_end)
 837     {
 838       /* SRC_BASE remembers the start position in source in each loop.
 839          The loop will be exited when there's not enough source text
 840          to analyze long escape sequence or 2-byte code (within macros
 841          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 842          to SRC_BASE before exiting.  */
 843       unsigned char *src_base = src;
 844       int c1 = *src++, c2;
 845
 846       switch (iso_code_class [c1])
 847         {
 848         case ISO_0x20_or_0x7F:
 849           if (!coding->composing
 850               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 851             {
 852               /* This is SPACE or DEL.  */
 853               *dst++ = c1;
 854               break;
 855             }
 856           /* This is a graphic character, we fall down ...  */
 857
 858         case ISO_graphic_plane_0:
 859           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 860             {
 861               /* This is a composition rule.  */
 862               *dst++ = c1 | 0x80;
 863               coding->composing = COMPOSING_WITH_RULE_TAIL;
 864             }
 865           else
 866             DECODE_ISO_CHARACTER (charset0, c1);
 867           break;
 868
 869         case ISO_0xA0_or_0xFF:
 870           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 871             {
 872               /* Invalid code.  */
 873               *dst++ = c1;
 874               break;
 875             }
 876           /* This is a graphic character, we fall down ... */
 877
 878         case ISO_graphic_plane_1:
 879           DECODE_ISO_CHARACTER (charset1, c1);
 880           break;
 881
 882         case ISO_control_code:
 883           /* All ISO2022 control characters in this class have the
 884              same representation in Emacs internal format.  */
 885           *dst++ = c1;
 886           break;
 887
 888         case ISO_carriage_return:
 889           if (coding->eol_type == CODING_EOL_CR)
 890             {
 891               *dst++ = '\n';
 892             }
 893           else if (coding->eol_type == CODING_EOL_CRLF)
 894             {
 895               ONE_MORE_BYTE (c1);
 896               if (c1 == ISO_CODE_LF)
 897                 *dst++ = '\n';
 898               else
 899                 {       /* Lone CR: unread the next byte, keep the CR.  */
 900                   src--;
 901                   *dst++ = '\r';
 902                 }
 903             }
 904           else
 905             {
 906               *dst++ = c1;
 907             }
 908           break;
 909
 910         case ISO_shift_out:
 911           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 912             goto label_invalid_escape_sequence;
 913           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 914           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 915           break;
 916
 917         case ISO_shift_in:
 918           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 919           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 920           break;
 921
 922         case ISO_single_shift_2_7:
 923         case ISO_single_shift_2:
 924           /* SS2 is handled as an escape sequence of ESC 'N' */
 925           c1 = 'N';
 926           goto label_escape_sequence;
 927
 928         case ISO_single_shift_3:
 929           /* SS3 is handled as an escape sequence of ESC 'O' */
 930           c1 = 'O';
 931           goto label_escape_sequence;
 932
 933         case ISO_control_sequence_introducer:
 934           /* CSI is handled as an escape sequence of ESC '[' ...  */
 935           c1 = '[';
 936           goto label_escape_sequence;
 937
 938         case ISO_escape:
 939           ONE_MORE_BYTE (c1);
 940         label_escape_sequence:
 941           /* Escape sequences handled by Emacs are invocation,
 942              designation, direction specification, and character
 943              composition specification.  */
 944           switch (c1)
 945             {
 946             case '&':           /* revision of following character set */
 947               ONE_MORE_BYTE (c1);
 948               if (!(c1 >= '@' && c1 <= '~'))
 949                 goto label_invalid_escape_sequence;
 950               ONE_MORE_BYTE (c1);
 951               if (c1 != ISO_CODE_ESC)
 952                 goto label_invalid_escape_sequence;
 953               ONE_MORE_BYTE (c1);
 954               goto label_escape_sequence;
 955
 956             case '$':           /* designation of 2-byte character set */
 957               ONE_MORE_BYTE (c1);
 958               if (c1 >= '@' && c1 <= 'B')
 959                 {       /* designation of JISX0208.1978, GB2312.1980,
 960                                    or JISX0208.1980 */
 961                   DECODE_DESIGNATION (0, 2, 94, c1);
 962                 }
 963               else if (c1 >= 0x28 && c1 <= 0x2B)
 964                 {       /* designation of DIMENSION2_CHARS94 character set */
 965                   ONE_MORE_BYTE (c2);
 966                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 967                 }
 968               else if (c1 >= 0x2C && c1 <= 0x2F)
 969                 {       /* designation of DIMENSION2_CHARS96 character set */
 970                   ONE_MORE_BYTE (c2);
 971                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 972                 }
 973               else
 974                 goto label_invalid_escape_sequence;
 975               break;
 976
 977             case 'n':           /* invocation of locking-shift-2 */
 978               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 979                 goto label_invalid_escape_sequence;
 980               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 981               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 982               break;
 983
 984             case 'o':           /* invocation of locking-shift-3 */
 985               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 986                 goto label_invalid_escape_sequence;
 987               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 988               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 989               break;
 990
 991             case 'N':           /* invocation of single-shift-2 */
 992               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 993                 goto label_invalid_escape_sequence;
 994               ONE_MORE_BYTE (c1);
 995               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 996               DECODE_ISO_CHARACTER (charset, c1);
 997               break;
 998
 999             case 'O':           /* invocation of single-shift-3 */
1000               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1001                 goto label_invalid_escape_sequence;
1002               ONE_MORE_BYTE (c1);
1003               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1004               DECODE_ISO_CHARACTER (charset, c1);
1005               break;
1006
1007             case '0':           /* start composing without embedded rules */
1008               coding->composing = COMPOSING_NO_RULE_HEAD;
1009               break;
1010
1011             case '1':           /* end composing */
1012               coding->composing = COMPOSING_NO;
1013               break;
1014
1015             case '2':           /* start composing with embedded rules */
1016               coding->composing = COMPOSING_WITH_RULE_HEAD;
1017               break;
1018
1019             case '[':           /* specification of direction */
1020               /* For the moment, nested direction is not supported.
1021                  So, the value of `coding->direction' is 0 or 1: 0
1022                  means left-to-right, 1 means right-to-left.  */
1023               ONE_MORE_BYTE (c1);
1024               switch (c1)
1025                 {
1026                 case ']':       /* end of the current direction */
1027                   coding->direction = 0;
1028                   break;
1029                 case '0':       /* end of the current direction */
1030                 case '1':       /* start of left-to-right direction */
1031                   ONE_MORE_BYTE (c1);
1032                   if (c1 == ']')
1033                     coding->direction = 0;
1034                   else
1035                     goto label_invalid_escape_sequence;
1036                   break;
1037
1038                 case '2':       /* start of right-to-left direction */
1039                   ONE_MORE_BYTE (c1);
1040                   if (c1 == ']')
1041                     coding->direction = 1;
1042                   else
1043                     goto label_invalid_escape_sequence;
1044                   break;
1045
1046                 default:
1047                   goto label_invalid_escape_sequence;
1048                 }
1049               break;
1050
1051             default:
1052               if (c1 >= 0x28 && c1 <= 0x2B)
1053                 {       /* designation of DIMENSION1_CHARS94 character set */
1054                   ONE_MORE_BYTE (c2);
1055                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1056                 }
1057               else if (c1 >= 0x2C && c1 <= 0x2F)
1058                 {       /* designation of DIMENSION1_CHARS96 character set */
1059                   ONE_MORE_BYTE (c2);
1060                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1061                 }
1062               else
1063                 {
1064                   goto label_invalid_escape_sequence;
1065                 }
1066             }
1067           /* We must update these variables now.  */
1068           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1069           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1070           break;
1071
1072         label_invalid_escape_sequence:
1073           {
1074             int length = src - src_base;
1075
1076             bcopy (src_base, dst, length);
1077             dst += length;
1078           }
1079         }
1080       continue;
1081
1082     label_end_of_loop:
1083       coding->carryover_size = src - src_base;
1084       bcopy (src_base, coding->carryover, coding->carryover_size);
1085       src = src_base;
1086       break;
1087     }
1088
1089   /* If this is the last block of the text to be decoded, we had
1090      better just flush out all remaining codes in the text although
1091      they are not valid characters.  */
1092   if (coding->last_block)
1093     {
1094       bcopy (src, dst, src_end - src);
1095       dst += (src_end - src);
1096       src = src_end;
1097     }
1098   *consumed = src - source;
1099   return dst - destination;
1100 }
1101
1102 /* ISO2022 encoding stuff.  */
1103
1104 /*
1105    It is not enough to say just "ISO2022" on encoding, we have to
1106    specify more details.  In Emacs, each coding-system of ISO2022
1107    variant has the following specifications:
1108         1. Initial designation to G0 thru G3.
1109         2. Allows short-form designation?
1110         3. ASCII should be designated to G0 before control characters?
1111         4. ASCII should be designated to G0 at end of line?
1112         5. 7-bit environment or 8-bit environment?
1113         6. Use locking-shift?
1114         7. Use Single-shift?
1115    And the following two are only for Japanese:
1116         8. Use ASCII in place of JISX0201-1976-Roman?
1117         9. Use JISX0208-1983 in place of JISX0208-1978?
1118    These specifications are encoded in `coding->flags' as flag bits
1119    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1120    details.
1121 */
1122
1123 /* Produce codes (escape sequence) for designating CHARSET to graphic
1124    register REG, and record the designation in CODING.  If CHARSET is
1125    found in Vcharset_revision_alist, a revision sequence ESC '&' comes
1126    first.  For a 2-byte 94-char CHARSET whose <final-char> is '@', 'A',
1127    or 'B', short-form is used if REG is 0 and CODING allows it.  */
1128 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1129   do {                                                                  \
1130     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1131     char *intermediate_char_94 = "()*+";                                \
1132     char *intermediate_char_96 = ",-./";                                \
1133     Lisp_Object temp                                                    \
1134       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1135     if (! NILP (temp))                                                  \
1136         {                                                               \
1137         *dst++ = ISO_CODE_ESC;                                          \
1138         *dst++ = '&';                                                   \
1139         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1140       }                                                                 \
1141     *dst++ = ISO_CODE_ESC;                                              \
1142     if (CHARSET_DIMENSION (charset) == 1)                               \
1143       {                                                                 \
1144         if (CHARSET_CHARS (charset) == 94)                              \
1145           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1146         else                                                            \
1147           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1148       }                                                                 \
1149     else                                                                \
1150       {                                                                 \
1151         *dst++ = '$';                                                   \
1152         if (CHARSET_CHARS (charset) == 94)                              \
1153           {                                                             \
1154             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1155                 || reg != 0                                             \
1156                 || final_char < '@' || final_char > 'B')                \
1157               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1158           }                                                             \
1159         else                                                            \
1160           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1161       }                                                                 \
1162     *dst++ = final_char;                                                \
1163     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1164   } while (0)
1165
1166 /* The following two macros produce codes for ISO2022 single-shift
1167    functions (single-shift-2 and single-shift-3): an escape sequence
1168    if CODING_FLAG_ISO_SEVEN_BITS is set, a control character otherwise.
1169    They also set the single-shifting state of CODING to 1.  */
1170 #define ENCODE_SINGLE_SHIFT_2                           \
1171   do {                                                  \
1172     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1173       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1174     else                                                \
1175       *dst++ = ISO_CODE_SS2;                            \
1176     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1177   } while (0)
1178
1179 #define ENCODE_SINGLE_SHIFT_3                           \
1180   do {                                                  \
1181     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1182       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1183     else                                                \
1184       *dst++ = ISO_CODE_SS3;                            \
1185     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1186   } while (0)
1187
1188 /* The following four macros produce codes (control character or
1189    escape sequence) for ISO2022 locking-shift functions (shift-in,
1190    shift-out, locking-shift-2, and locking-shift-3), and record in
1191    CODING which graphic register is now invoked to graphic plane 0.  */
1192 #define ENCODE_SHIFT_IN                         \
1193   do {                                          \
1194     *dst++ = ISO_CODE_SI;                       \
1195     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1196   } while (0)
1197
1198 #define ENCODE_SHIFT_OUT                        \
1199   do {                                          \
1200     *dst++ = ISO_CODE_SO;                       \
1201     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1202   } while (0)
1203
1204 #define ENCODE_LOCKING_SHIFT_2                  \
1205   do {                                          \
1206     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1207     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1208   } while (0)
1209
1210 #define ENCODE_LOCKING_SHIFT_3                  \
1211   do {                                          \
1212     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1213     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1214   } while (0)
1215
1216 /* Produce codes for a DIMENSION1 character whose character set is
1217    CHARSET and whose position-code is C1.  Designation and invocation
1218    sequences are also produced in advance if necessary: the loop body
1219    is repeated after encode_invocation_designation until one of the
1220    branches that `break' has produced the character.  */
1221 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1222   do {                                                                  \
1223     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))  /* after SS2/SS3 */  \
1224       {                                                                 \
1225         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1226           *dst++ = c1 & 0x7F;                                           \
1227         else                                                            \
1228           *dst++ = c1 | 0x80;                                           \
1229         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1230         break;                                                          \
1231       }                                                                 \
1232     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1233       {                                                                 \
1234         *dst++ = c1 & 0x7F;                                             \
1235         break;                                                          \
1236       }                                                                 \
1237     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1238       {                                                                 \
1239         *dst++ = c1 | 0x80;                                             \
1240         break;                                                          \
1241       }                                                                 \
1242     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1243              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1244       {                                                                 \
1245         /* We should not encode this character, instead produce one or  \
1246            two `?'s.  */                                                \
1247         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1248         if (CHARSET_WIDTH (charset) == 2)                               \
1249           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1250         break;                                                          \
1251       }                                                                 \
1252     else                                                                \
1253       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1254          must invoke it, or, at first, designate it to some graphic     \
1255          register.  Then repeat the loop to actually produce the        \
1256          character.  */                                                 \
1257       dst = encode_invocation_designation (charset, coding, dst);       \
1258   } while (1)
1259
1260 /* Produce codes for a DIMENSION2 character whose character set is
1261    CHARSET and whose position-codes are C1 and C2.  Designation and
1262    invocation codes are also produced in advance if necessary, by
1263    repeating the loop body after encode_invocation_designation.  */
1264 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1265   do {                                                                  \
1266     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))  /* after SS2/SS3 */  \
1267       {                                                                 \
1268         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1269           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1270         else                                                            \
1271           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1272         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1273         break;                                                          \
1274       }                                                                 \
1275     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1276       {                                                                 \
1277         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1278         break;                                                          \
1279       }                                                                 \
1280     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1281       {                                                                 \
1282         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1283         break;                                                          \
1284       }                                                                 \
1285     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1286              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1287       {                                                                 \
1288         /* We should not encode this character, instead produce one or  \
1289            two `?'s.  */                                                \
1290         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1291         if (CHARSET_WIDTH (charset) == 2)                               \
1292           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1293         break;                                                          \
1294       }                                                                 \
1295     else                                                                \
1296       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1297          must invoke it, or, at first, designate it to some graphic     \
1298          register.  Then repeat the loop to actually produce the        \
1299          character.  */                                                 \
1300       dst = encode_invocation_designation (charset, coding, dst);       \
1301   } while (1)
1302
1303 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1304   do {  /* Apply `unification_table', then encode by dimension.  */       \
1305     int c_alt, charset_alt;                                               \
1306     if (!NILP (unification_table)                                         \
1307         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1308             >= 0))                                                        \
1309       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1310     else        /* no table, or unify_char gave a negative result */      \
1311       charset_alt = charset;                                              \
1312     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1313       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1314     else                                                                  \
1315       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1316   } while (0)
1317
1318 /* Emit at DST whatever designation and invocation codes are needed
1319    before a character of CHARSET can be encoded, updating the element
1320    `spec.iso2022' of *CODING to match.  Return the new DST.  */
1321
1322 unsigned char *
1323 encode_invocation_designation (charset, coding, dst)
1324      int charset;
1325      struct coding_system *coding;
1326      unsigned char *dst;
1327 {
1328   /* Number of the graphic register that holds, or will hold, CHARSET.  */
1329   int reg = 0;
1330
1331   /* Look for a graphic register CHARSET is already designated to.  */
1332   while (reg < 4 && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
1333     reg++;
1334
1335   if (reg == 4)
1336     {
1337       /* None found.  Use the graphic register CHARSET requests, or
1338          graphic register 0 when it requests no special designation,
1339          and produce the designation sequence for it.  */
1340       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1341       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1342         reg = 0;
1343
1344       ENCODE_DESIGNATION (charset, reg, coding);
1345     }
1346
1347   /* If REG is already invoked to graphic plane 0 or 1, we are done.  */
1348   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
1349       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
1350     return dst;
1351
1352   /* Otherwise REG must be invoked before it can be used.  */
1353   if (reg == 0)
1354     /* Graphic register 0: shift-in.  */
1355     ENCODE_SHIFT_IN;
1356   else if (reg == 1)
1357     /* Graphic register 1: shift-out.  */
1358     ENCODE_SHIFT_OUT;
1359   else if (reg == 2)
1360     {
1361       /* Graphic register 2: single-shift-2 if CODING uses
1362          single-shift, locking-shift-2 otherwise.  */
1363       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1364         ENCODE_SINGLE_SHIFT_2;
1365       else
1366         ENCODE_LOCKING_SHIFT_2;
1367     }
1368   else if (reg == 3)
1369     {
1370       /* Graphic register 3: likewise, with single-shift-3 or
1371          locking-shift-3.  */
1372       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1373         ENCODE_SINGLE_SHIFT_3;
1374       else
1375         ENCODE_LOCKING_SHIFT_3;
1376     }
1377
1378   return dst;
1379 }
1380
1381 /* Three macros that emit ESC '0', ESC '2' and ESC '1' at DST to mark composition.  */
1382 #define ENCODE_COMPOSITION_NO_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
1383 #define ENCODE_COMPOSITION_WITH_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
1384 #define ENCODE_COMPOSITION_END    (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1385
1386 /* The following three macros produce codes for indicating direction
1387    of text.  They use `coding' and `dst' from the expansion site.  */
1388 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1389   do {                                                  \
1390     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1391       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1392     else                                                \
1393       *dst++ = ISO_CODE_CSI;                            \
1394   } while (0)
1395
1396 #define ENCODE_DIRECTION_R2L    \
1397   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2', *dst++ = ']'; } while (0)
1398
1399 #define ENCODE_DIRECTION_L2R    \
1400   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0', *dst++ = ']'; } while (0)
1401
1402 /* Reset graphic planes and registers to the initial state: shift in
1403    if plane 0 is off register 0, then redo changed designations.  */
1404 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1405   do {                                                                      \
1406     int reg;  /* graphic register number */                                 \
1407     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1408       ENCODE_SHIFT_IN;                                                      \
1409     for (reg = 0; reg < 4; reg++)  /* skip registers with no initial one */ \
1410       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1411           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1412               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1413         ENCODE_DESIGNATION                                                  \
1414           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1415   } while (0)
1416
1417 /* Produce designation sequences of charsets in the line started from
1418    *SRC to a place pointed by *DSTP, and advance *DSTP past them.
1419    TABLE, if non-nil, is given to unify_char to map each character.
1420    If the current block ends before any end-of-line, we may fail to
1421    find all the necessary designations.  */
1422 encode_designation_at_bol (coding, table, src, src_end, dstp)
1423      struct coding_system *coding;
1424      Lisp_Object table;
1425      unsigned char *src, *src_end, **dstp;
1426 {
1427   int charset, c, found = 0, reg;
1428   /* Table of charsets to be designated to each graphic register.  */
1429   int r[4];
1430   unsigned char *dst = *dstp;
1431
1432   for (reg = 0; reg < 4; reg++)
1433     r[reg] = -1;
1434
1435   while (src < src_end && *src != '\n' && found < 4)
1436     {
1437       int bytes = BYTES_BY_CHAR_HEAD (*src);
1438
1439       if (NILP (table))
1440         charset = CHARSET_AT (src);
1441       else
1442         {
1443           int c_alt, c1, c2;
1444
1445           SPLIT_STRING(src, bytes, charset, c1, c2);
1446           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1447             charset = CHAR_CHARSET (c_alt);
1448         }
1449       /* Keep only the first charset that requests each register.  */
1450       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1451       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1452         {
1453           found++;
1454           r[reg] = charset;
1455         }
1456
1457       src += bytes;
1458     }
1459
1460   if (found)
1461     {
1462       for (reg = 0; reg < 4; reg++)
1463         if (r[reg] >= 0
1464             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1465           ENCODE_DESIGNATION (r[reg], reg, coding);
1466       *dstp = dst;
1467     }
1468 }
1469
1470 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1471    Return bytes written to DESTINATION; set *CONSUMED to source bytes used.  */
1472 int
1473 encode_coding_iso2022 (coding, source, destination,
1474                        src_bytes, dst_bytes, consumed)
1475      struct coding_system *coding;
1476      unsigned char *source, *destination;
1477      int src_bytes, dst_bytes;
1478      int *consumed;
1479 {
1480   unsigned char *src = source;
1481   unsigned char *src_end = source + src_bytes;
1482   unsigned char *dst = destination;
1483   unsigned char *dst_end = destination + dst_bytes;
1484   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1485      from DST_END to assure overflow checking is necessary only at the
1486      head of loop.  */
1487   unsigned char *adjusted_dst_end = dst_end - 19;
1488   Lisp_Object unification_table
1489       = coding->character_unification_table_for_encode;
1490
1491   if (!NILP (Venable_character_unification) && NILP (unification_table))
1492     unification_table = Vstandard_character_unification_table_for_encode;
1493
1494   while (src < src_end && dst < adjusted_dst_end)
1495     {
1496       /* SRC_BASE remembers the start position in source in each loop.
1497          The loop will be exited when there's not enough source text
1498          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1499          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, the bytes
1500          from SRC_BASE on are saved as carryover before exiting.  */
1501       unsigned char *src_base = src;
1502       int charset, c1, c2, c3, c4;
1503
1504       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1505           && CODING_SPEC_ISO_BOL (coding))
1506         {
1507           /* We have to produce designation sequences if any now.  */
1508           encode_designation_at_bol (coding, unification_table,
1509                                      src, src_end, &dst);
1510           CODING_SPEC_ISO_BOL (coding) = 0;
1511         }
1512
1513       c1 = *src++;
1514       /* If we are seeing a component of a composite character, we are
1515          seeing a leading-code specially encoded for composition, or a
1516          composition rule if composing with rule.  We must set C1
1517          to a normal leading-code or an ASCII code.  If we are not at
1518          a composed character, we must reset the composition state.  */
1519       if (COMPOSING_P (coding->composing))
1520         {
1521           if (c1 < 0xA0)
1522             {
1523               /* We are not in a composite character any longer.  */
1524               coding->composing = COMPOSING_NO;
1525               ENCODE_COMPOSITION_END;
1526             }
1527           else
1528             {
1529               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1530                 {
1531                   *dst++ = c1 & 0x7F;
1532                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1533                   continue;
1534                 }
1535               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1536                 coding->composing = COMPOSING_WITH_RULE_RULE;
1537               if (c1 == 0xA0)
1538                 {
1539                   /* This is an ASCII component.  */
1540                   ONE_MORE_BYTE (c1);
1541                   c1 &= 0x7F;
1542                 }
1543               else
1544                 /* This is a leading-code of non ASCII component.  */
1545                 c1 -= 0x20;
1546             }
1547         }
1548
1549       /* Now encode one character.  C1 is a control character, an
1550          ASCII character, or a leading-code of multi-byte character.  */
1551       switch (emacs_code_class[c1])
1552         {
1553         case EMACS_ascii_code:
1554           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1555           break;
1556
1557         case EMACS_control_code:
1558           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1559             ENCODE_RESET_PLANE_AND_REGISTER;
1560           *dst++ = c1;
1561           break;
1562
1563         case EMACS_carriage_return_code:
1564           if (!coding->selective)
1565             {
1566               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1567                 ENCODE_RESET_PLANE_AND_REGISTER;
1568               *dst++ = c1;
1569               break;
1570             }
1571           /* fall down to treat '\r' as '\n' ...  */
1572
1573         case EMACS_linefeed_code:
1574           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1575             ENCODE_RESET_PLANE_AND_REGISTER;
1576           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1577             bcopy (coding->spec.iso2022.initial_designation,
1578                    coding->spec.iso2022.current_designation,
1579                    sizeof coding->spec.iso2022.initial_designation);
1580           if (coding->eol_type == CODING_EOL_LF
1581               || coding->eol_type == CODING_EOL_UNDECIDED)
1582             *dst++ = ISO_CODE_LF;
1583           else if (coding->eol_type == CODING_EOL_CRLF)
1584             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1585           else
1586             *dst++ = ISO_CODE_CR;
1587           CODING_SPEC_ISO_BOL (coding) = 1;
1588           break;
1589
1590         case EMACS_leading_code_2:
1591           ONE_MORE_BYTE (c2);
1592           if (c2 < 0xA0)
1593             {
1594               /* invalid sequence */
1595               *dst++ = c1;
1596               *dst++ = c2;
1597             }
1598           else
1599             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1600           break;
1601
1602         case EMACS_leading_code_3:
1603           TWO_MORE_BYTES (c2, c3);
1604           if (c2 < 0xA0 || c3 < 0xA0)
1605             {
1606               /* invalid sequence */
1607               *dst++ = c1;
1608               *dst++ = c2;
1609               *dst++ = c3;
1610             }
1611           else if (c1 < LEADING_CODE_PRIVATE_11)
1612             ENCODE_ISO_CHARACTER (c1, c2, c3);
1613           else
1614             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1615           break;
1616
1617         case EMACS_leading_code_4:
1618           THREE_MORE_BYTES (c2, c3, c4);
1619           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1620             {
1621               /* invalid sequence */
1622               *dst++ = c1;
1623               *dst++ = c2;
1624               *dst++ = c3;
1625               *dst++ = c4;
1626             }
1627           else
1628             ENCODE_ISO_CHARACTER (c2, c3, c4);
1629           break;
1630
1631         case EMACS_leading_code_composition:
1632           ONE_MORE_BYTE (c2);
1633           if (c2 < 0xA0)
1634             {
1635               /* invalid sequence */
1636               *dst++ = c1;
1637               *dst++ = c2;
1638             }
1639           else if (c2 == 0xFF)
1640             {
1641               coding->composing = COMPOSING_WITH_RULE_HEAD;
1642               ENCODE_COMPOSITION_WITH_RULE_START;
1643             }
1644           else
1645             {
1646               /* Rewind one byte because it is a character code of
1647                  composition elements.  */
1648               src--;
1649               coding->composing = COMPOSING_NO_RULE_HEAD;
1650               ENCODE_COMPOSITION_NO_RULE_START;
1651             }
1652           break;
1653
1654         case EMACS_invalid_code:
1655           *dst++ = c1;
1656           break;
1657         }
1658       continue;
1659     label_end_of_loop:
1660       /* We reach here because the source data ends not at character
1661          boundary.  Save the partial character and mark all as read.  */
1662       coding->carryover_size = src_end - src_base;
1663       bcopy (src_base, coding->carryover, coding->carryover_size);
1664       src = src_end;
1665       break;
1666     }
1667
1668   /* If this is the last block of the text to be encoded, we must
1669      reset graphic planes and registers to the initial state.  */
1670   if (src >= src_end && coding->last_block)
1671     {
1672       ENCODE_RESET_PLANE_AND_REGISTER;
1673       if (coding->carryover_size > 0
1674           && coding->carryover_size < (dst_end - dst))
1675         {
1676           bcopy (coding->carryover, dst, coding->carryover_size);
1677           dst += coding->carryover_size;
1678           coding->carryover_size = 0;
1679         }
1680     }
1681   *consumed = src - source;
1682   return dst - destination;
1683 }
1684
1685 \f
1686 /*** 4. SJIS and BIG5 handlers ***/
1687
1688 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1689    quite widely.  So, for the moment, Emacs supports them in the bare
1690    C code.  But, in the future, they may be supported only by CCL.  */
1691
1692 /* SJIS is a coding system encoding three character sets: ASCII, right
1693    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1694    as is.  A character of charset katakana-jisx0201 is encoded by
1695    "position-code + 0x80".  A character of charset japanese-jisx0208
1696    is encoded in two bytes, but its two position-codes are divided and
1697    shifted so that they fit in the ranges below.
1698
1699    --- CODE RANGE of SJIS ---
1700    (character set)      (range)
1701    ASCII                0x00 .. 0x7F
1702    KATAKANA-JISX0201    0xA0 .. 0xDF
1703    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1704             (2nd byte)  0x40 .. 0xFF
1705    -------------------------------
1706
1707 */
1708
1709 /* BIG5 is a coding system encoding two character sets: ASCII and
1710    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1711    character set and is encoded in two bytes.
1712
1713    --- CODE RANGE of BIG5 ---
1714    (character set)      (range)
1715    ASCII                0x00 .. 0x7F
1716    Big5 (1st byte)      0xA1 .. 0xFE
1717         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1718    --------------------------
1719
1720    Since the number of characters in Big5 is larger than maximum
1721    characters in Emacs' charset (96x96), it can't be handled as one
1722    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1723    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1724    contains frequently used characters and the latter contains less
1725    frequently used characters.  */
1726
1727 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1728    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1729    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1730    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1731
1732 /* Number of distinct 2nd bytes (0x40..0x7E, 0xA1..0xFE) per Big5 1st byte.  */
1733 #define BIG5_SAME_ROW ((0x7F - 0x40) + (0xFF - 0xA1))
1734
1735 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1736   do {  /* TEMP is the linear index of (B1, B2) within its charset.  */ \
1737     unsigned int temp                                                   \
1738       = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
1739          + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
1740     if ((b1) < 0xC9)  /* rows below 0xC9 belong to charset_big5_1 */    \
1741       (charset) = charset_big5_1;                                       \
1742     else                                                                \
1743       {                                                                 \
1744         (charset) = charset_big5_2;                                     \
1745         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1746       }  /* split the index into two position-codes from 0x21 up */    \
1747     (c1) = temp / (0xFF - 0xA1) + 0x21, (c2) = temp % (0xFF - 0xA1) + 0x21; \
1748   } while (0)
1749
1750 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1751   do {  /* TEMP is the linear index of (C1, C2); big5_2 is offset.  */  \
1752     unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
1753     if ((charset) == charset_big5_2)                                    \
1754       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1755     (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
1756     (b2) = temp % BIG5_SAME_ROW;                                        \
1757     (b2) += (b2) < 0x3F ? 0x40 : 0x62;  /* skip the 0x7F..0xA0 gap */   \
1758   } while (0)
1759
1760 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1761   do {  /* Unify (CHARSET, C1, C2) if a table is set, then decode.  */  \
1762     int charset_alt = (charset);                                        \
1763     int c_alt = (NILP (unification_table) ? -1                          \
1764                  : unify_char (unification_table, -1, (charset), c1, c2)); \
1765     if (c_alt >= 0)                                                     \
1766       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
1767     if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
1768       DECODE_CHARACTER_ASCII (c1);                                      \
1769     else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
1770       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1771     else                                                                \
1772       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1773   } while (0)
1774
1775 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1776   do {  /* Emit one character at DST; uses sjis_p, unification_table.  */ \
1777     int c_alt, charset_alt;                                               \
1778     if (!NILP (unification_table)                                         \
1779         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1780             >= 0))                                                        \
1781       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1782     else                                                                  \
1783       charset_alt = charset;                                              \
1784     if (charset_alt == charset_ascii)                                     \
1785       *dst++ = c1;                                                        \
1786     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1787       {                                                                   \
1788         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1789           *dst++ = c1;                                                    \
1790         else                                                              \
1791           *dst++ = charset_alt, *dst++ = c1;                              \
1792       }                                                                   \
1793     else                                                                  \
1794       {                                                                   \
1795         c1 &= 0x7F, c2 &= 0x7F;                                           \
1796         if (sjis_p && charset_alt == charset_jisx0208)                    \
1797           {                                                               \
1798             unsigned char s1, s2;                                         \
1799                                                                           \
1800             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1801             *dst++ = s1, *dst++ = s2;                                     \
1802           }                                                               \
1803         else if (!sjis_p                                                  \
1804                  && (charset_alt == charset_big5_1                        \
1805                      || charset_alt == charset_big5_2))                   \
1806           {                                                               \
1807             unsigned char b1, b2;                                         \
1808                                                                           \
1809             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1810             *dst++ = b1, *dst++ = b2;                                     \
1811           }                                                               \
1812         else  /* other charset: written as is, three bytes */             \
1813           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1814       }                                                                   \
1815   } while (0)
1816
1817 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1818    Scan SRC up to SRC_END for bytes that rule out SJIS text.
1819    Return CODING_CATEGORY_MASK_SJIS if none is found, else 0.  */
1820
1821 int
1822 detect_coding_sjis (src, src_end)
1823      unsigned char *src, *src_end;
1824 {
1825   while (src < src_end)
1826     {
1827       unsigned char lead = *src++;
1828       /* An escape or shift code rules out SJIS.  */
1829       if (lead == ISO_CODE_ESC || lead == ISO_CODE_SI || lead == ISO_CODE_SO)
1830         return 0;
1831       /* Only bytes 0x80..0x9F and 0xE0..0xFF have a 2nd byte to check.  */
1832       if (lead < 0x80 || (lead >= 0xA0 && lead < 0xE0))
1833         continue;
1834       /* The 2nd byte, when present in this text, must be 0x40 or above.  */
1835       if (src < src_end && *src++ < 0x40)
1836         return 0;
1837     }
1838   return CODING_CATEGORY_MASK_SJIS;
1839 }
1840
1841 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1842    Scan SRC up to SRC_END for bytes that rule out BIG5 text.
1843    Return CODING_CATEGORY_MASK_BIG5 if none is found, else 0.  */
1844
1845 int
1846 detect_coding_big5 (src, src_end)
1847      unsigned char *src, *src_end;
1848 {
1849   unsigned char lead, trail;
1850
1851   while (src < src_end)
1852     {
1853       lead = *src++;
1854       if (lead == ISO_CODE_ESC || lead == ISO_CODE_SI || lead == ISO_CODE_SO)
1855         return 0;
1856       if (lead < 0xA1)
1857         continue;
1858       if (! (src < src_end))
1859         break;
1860       trail = *src++;
1861       /* Accepted 2nd bytes are 0x40..0x7E and 0xA1 or above.  */
1862       if (trail < 0x40 || (trail > 0x7E && trail < 0xA1))
1863         return 0;
1864     }
1865   return CODING_CATEGORY_MASK_BIG5;
1866 }
1867
1868 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1869    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.  Return
1870    bytes written to DESTINATION; set *CONSUMED to source bytes used.  */
1871 int
1872 decode_coding_sjis_big5 (coding, source, destination,
1873                          src_bytes, dst_bytes, consumed, sjis_p)
1874      struct coding_system *coding;
1875      unsigned char *source, *destination;
1876      int src_bytes, dst_bytes;
1877      int *consumed;
1878      int sjis_p;
1879 {
1880   unsigned char *src = source;
1881   unsigned char *src_end = source + src_bytes;
1882   unsigned char *dst = destination;
1883   unsigned char *dst_end = destination + dst_bytes;
1884   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1885      from DST_END to assure overflow checking is necessary only at the
1886      head of loop.  */
1887   unsigned char *adjusted_dst_end = dst_end - 3;
1888   Lisp_Object unification_table
1889       = coding->character_unification_table_for_decode;
1890
1891   if (!NILP (Venable_character_unification) && NILP (unification_table))
1892     unification_table = Vstandard_character_unification_table_for_decode;
1893
1894   while (src < src_end && dst < adjusted_dst_end)
1895     {
1896       /* SRC_BASE remembers the start position in source in each loop.
1897          The loop will be exited when there's not enough source text
1898          to analyze two-byte character (within macro ONE_MORE_BYTE).
1899          In that case, SRC is reset to SRC_BASE before exiting.  */
1900       unsigned char *src_base = src;
1901       unsigned char c1 = *src++, c2, c3, c4;
1902
1903       if (c1 == '\r')
1904         {
1905           if (coding->eol_type == CODING_EOL_CRLF)
1906             {
1907               ONE_MORE_BYTE (c2);
1908               if (c2 == '\n')
1909                 *dst++ = c2;
1910               else
1911                 /* To process C2 again, SRC is subtracted by 1.  */
1912                 *dst++ = c1, src--;
1913             }
1914           else
1915             *dst++ = c1;
1916         }
1917       else if (c1 < 0x20)
1918         *dst++ = c1;
1919       else if (c1 < 0x80)
1920         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1921       else if (c1 < 0xA0 || c1 >= 0xE0)
1922         {
1923           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1924           if (sjis_p)
1925             {
1926               ONE_MORE_BYTE (c2);
1927               DECODE_SJIS (c1, c2, c3, c4);
1928               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1929             }
1930           else if (c1 >= 0xE0 && c1 < 0xFF)
1931             {
1932               int charset;
1933
1934               ONE_MORE_BYTE (c2);
1935               DECODE_BIG5 (c1, c2, charset, c3, c4);
1936               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1937             }
1938           else                  /* Invalid code */
1939             *dst++ = c1;
1940         }
1941       else
1942         {
1943           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1944           if (sjis_p)
1945             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1946           else
1947             {
1948               int charset;
1949
1950               ONE_MORE_BYTE (c2);
1951               DECODE_BIG5 (c1, c2, charset, c3, c4);
1952               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1953             }
1954         }
1955       continue;
1956
1957     label_end_of_loop:
1958       coding->carryover_size = src - src_base;
1959       bcopy (src_base, coding->carryover, coding->carryover_size);
1960       src = src_base;
1961       break;
1962     }
1963
1964   *consumed = src - source;
1965   return dst - destination;
1966 }
1967
1968 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1969    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1970    `charset_jisx0208', `charset_big5_1', and `charset_big5_2'.  We are
1971    sure that all these charsets are registered as official charset
1972    (i.e. do not have extended leading-codes).  Characters of other
1973    charsets are produced without any encoding.  If SJIS_P is 1, encode
1974    SJIS text, else encode BIG5 text.  */
1975
1976 int
1977 encode_coding_sjis_big5 (coding, source, destination,
1978                          src_bytes, dst_bytes, consumed, sjis_p)
1979      struct coding_system *coding;
1980      unsigned char *source, *destination;
1981      int src_bytes, dst_bytes;
1982      int *consumed;
1983      int sjis_p;
1984 {
1985   unsigned char *src = source;
1986   unsigned char *src_end = source + src_bytes;
1987   unsigned char *dst = destination;
1988   unsigned char *dst_end = destination + dst_bytes;
1989   /* Since the maximum bytes produced by each loop is 3 (a character of
1990      an unsupported two-byte charset is written as is), we subtract 2
1991      from DST_END so overflow is checked only at the head of loop.  */
1992   unsigned char *adjusted_dst_end = dst_end - 2;
1993   Lisp_Object unification_table
1994       = coding->character_unification_table_for_encode;
1995
1996   if (!NILP (Venable_character_unification) && NILP (unification_table))
1997     unification_table = Vstandard_character_unification_table_for_encode;
1998
1999   while (src < src_end && dst < adjusted_dst_end)
2000     {
2001       /* SRC_BASE remembers the start position in source in each loop.
2002          The loop will be exited when there's not enough source text
2003          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
2004          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, the bytes
2005          from SRC_BASE on are saved as carryover before exiting.  */
2006       unsigned char *src_base = src;
2007       unsigned char c1 = *src++, c2, c3, c4;
2008
2009       if (coding->composing)
2010         {
2011           if (c1 == 0xA0)
2012             {
2013               ONE_MORE_BYTE (c1);
2014               c1 &= 0x7F;
2015             }
2016           else if (c1 >= 0xA0)
2017             c1 -= 0x20;
2018           else
2019             coding->composing = 0;
2020         }
2021
2022       switch (emacs_code_class[c1])
2023         {
2024         case EMACS_ascii_code:
2025           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2026           break;
2027
2028         case EMACS_control_code:
2029           *dst++ = c1;
2030           break;
2031
2032         case EMACS_carriage_return_code:
2033           if (!coding->selective)
2034             {
2035               *dst++ = c1;
2036               break;
2037             }
2038           /* fall down to treat '\r' as '\n' ...  */
2039
2040         case EMACS_linefeed_code:
2041           if (coding->eol_type == CODING_EOL_LF
2042               || coding->eol_type == CODING_EOL_UNDECIDED)
2043             *dst++ = '\n';
2044           else if (coding->eol_type == CODING_EOL_CRLF)
2045             *dst++ = '\r', *dst++ = '\n';
2046           else
2047             *dst++ = '\r';
2048           break;
2049
2050         case EMACS_leading_code_2:
2051           ONE_MORE_BYTE (c2);
2052           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2053           break;
2054
2055         case EMACS_leading_code_3:
2056           TWO_MORE_BYTES (c2, c3);
2057           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2058           break;
2059
2060         case EMACS_leading_code_4:
2061           THREE_MORE_BYTES (c2, c3, c4);
2062           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2063           break;
2064
2065         case EMACS_leading_code_composition:
2066           coding->composing = 1;
2067           break;
2068
2069         default:                /* i.e. case EMACS_invalid_code: */
2070           *dst++ = c1;
2071         }
2072       continue;
2073
2074     label_end_of_loop:
2075       coding->carryover_size = src_end - src_base;
2076       bcopy (src_base, coding->carryover, coding->carryover_size);
2077       src = src_end;
2078       break;
2079     }
2080
2081   *consumed = src - source;
2082   return dst - destination;
2083 }
2084
2085 \f
2086 /*** 5. End-of-line handlers ***/
2087
2088 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2089    This function is called only when `coding->eol_type' is
2090    CODING_EOL_CRLF or CODING_EOL_CR.  Return bytes written to
2091    DESTINATION; set *CONSUMED to the source bytes used.  */
2092 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2093      struct coding_system *coding;
2094      unsigned char *source, *destination;
2095      int src_bytes, dst_bytes;
2096      int *consumed;
2097 {
2098   unsigned char *src = source;
2099   unsigned char *src_end = source + src_bytes;
2100   unsigned char *dst = destination;
2101   unsigned char *dst_end = destination + dst_bytes;
2102   int produced;
2103
2104   switch (coding->eol_type)
2105     {
2106     case CODING_EOL_CRLF:
2107       {
2108         /* Since the maximum bytes produced by each loop is 2, we
2109            subtract 1 from DST_END to assure overflow checking is
2110            necessary only at the head of loop.  */
2111         unsigned char *adjusted_dst_end = dst_end - 1;
2112
2113         while (src < src_end && dst < adjusted_dst_end)
2114           {
2115             unsigned char *src_base = src;
2116             unsigned char c = *src++;
2117             if (c == '\r')
2118               {
2119                 ONE_MORE_BYTE (c);
2120                 if (c != '\n')  /* a lone '\r' is kept as is */
2121                   *dst++ = '\r';
2122                 *dst++ = c;
2123               }
2124             else
2125               *dst++ = c;
2126             continue;
2127
2128           label_end_of_loop:
2129             coding->carryover_size = src - src_base;
2130             bcopy (src_base, coding->carryover, coding->carryover_size);
2131             src = src_base;
2132             break;
2133           }
2134         *consumed = src - source;
2135         produced = dst - destination;
2136         break;
2137       }
2138
2139     case CODING_EOL_CR:         /* copy what fits, then map '\r' to '\n' */
2140       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2141       bcopy (source, destination, produced);
2142       dst_end = destination + produced;
2143       while (dst < dst_end)
2144         if (*dst++ == '\r') dst[-1] = '\n';
2145       *consumed = produced;
2146       break;
2147
2148     default:                    /* i.e. case: CODING_EOL_LF */
2149       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2150       bcopy (source, destination, produced);
2151       *consumed = produced;
2152       break;
2153     }
2154
2155   return produced;
2156 }
2157
2158 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2159    format of end-of-line according to `coding->eol_type'.  If
2160    `coding->selective' is 1, code '\r' in source text also means
2161    end-of-line.  */
2162
2163 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2164      struct coding_system *coding;
2165      unsigned char *source, *destination;
2166      int src_bytes, dst_bytes;
2167      int *consumed;
2168 {
2169   unsigned char *src = source;
2170   unsigned char *dst = destination;
2171   int produced;
2172
2173   if (src_bytes <= 0)
2174     return 0;
2175
2176   switch (coding->eol_type)
2177     {
2178     case CODING_EOL_LF:
2179     case CODING_EOL_UNDECIDED:
2180       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2181       bcopy (source, destination, produced);
2182       if (coding->selective)
2183         {
2184           int i = produced;
2185           while (i--)
2186             if (*dst++ == '\r') dst[-1] = '\n';
2187         }
2188       *consumed = produced;
2189
2190     case CODING_EOL_CRLF:
2191       {
2192         unsigned char c;
2193         unsigned char *src_end = source + src_bytes;
2194         unsigned char *dst_end = destination + dst_bytes;
2195         /* Since the maximum bytes produced by each loop is 2, we
2196            subtract 1 from DST_END to assure overflow checking is
2197            necessary only at the head of loop.  */
2198         unsigned char *adjusted_dst_end = dst_end - 1;
2199
2200         while (src < src_end && dst < adjusted_dst_end)
2201           {
2202             c = *src++;
2203             if (c == '\n' || (c == '\r' && coding->selective))
2204               *dst++ = '\r', *dst++ = '\n';
2205             else
2206               *dst++ = c;
2207           }
2208         produced = dst - destination;
2209         *consumed = src - source;
2210         break;
2211       }
2212
2213     default:                    /* i.e. case CODING_EOL_CR: */
2214       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2215       bcopy (source, destination, produced);
2216       {
2217         int i = produced;
2218         while (i--)
2219           if (*dst++ == '\n') dst[-1] = '\r';
2220       }
2221       *consumed = produced;
2222     }
2223
2224   return produced;
2225 }
2226
2227 \f
2228 /*** 6. C library functions ***/
2229
2230 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2231    has a property `coding-system'.  The value of this property is a
2232    vector of length 5 (called as coding-vector).  Among elements of
2233    this vector, the first (element[0]) and the fifth (element[4])
2234    carry important information for decoding/encoding.  Before
2235    decoding/encoding, this information should be set in fields of a
2236    structure of type `coding_system'.
2237
2238    A value of property `coding-system' can be a symbol of another
2239    subsidiary coding-system.  In that case, Emacs gets coding-vector
2240    from that symbol.
2241
2242    `element[0]' contains information to be set in `coding->type'.  The
2243    value and its meaning is as follows:
2244
2245    0 -- coding_type_emacs_mule
2246    1 -- coding_type_sjis
2247    2 -- coding_type_iso2022
2248    3 -- coding_type_big5
   4 -- coding_type_ccl encoder/decoder written in CCL
   5 -- coding_type_raw_text
2250    nil -- coding_type_no_conversion
2251    t -- coding_type_undecided (automatic conversion on decoding,
2252                                no-conversion on encoding)
2253
2254    `element[4]' contains information to be set in `coding->flags' and
2255    `coding->spec'.  The meaning varies by `coding->type'.
2256
2257    If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
2259    Meanings of these sub-elements are:
2260
2261    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2262         If the value is an integer of valid charset, the charset is
2263         assumed to be designated to graphic register N initially.
2264
2265         If the value is minus, it is a minus value of charset which
2266         reserves graphic register N, which means that the charset is
2267         not designated initially but should be designated to graphic
2268         register N just before encoding a character in that charset.
2269
2270         If the value is nil, graphic register N is never used on
2271         encoding.
2272
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2274         Each value takes t or nil.  See the section ISO2022 of
2275         `coding.h' for more information.
2276
2277    If `coding->type' is `coding_type_big5', element[4] is t to denote
2278    BIG5-ETen or nil to denote BIG5-HKU.
2279
2280    If `coding->type' takes the other value, element[4] is ignored.
2281
2282    Emacs Lisp's coding system also carries information about format of
2283    end-of-line in a value of property `eol-type'.  If the value is
2284    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2285    means CODING_EOL_CR.  If it is not integer, it should be a vector
2286    of subsidiary coding systems of which property `eol-type' has one
2287    of above values.
2288
2289 */
2290
2291 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2292    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2293    is setup so that no conversion is necessary and return -1, else
2294    return 0.  */
2295
2296 int
2297 setup_coding_system (coding_system, coding)
2298      Lisp_Object coding_system;
2299      struct coding_system *coding;
2300 {
2301   Lisp_Object type, eol_type;
2302
2303   /* At first, set several fields to default values.  */
2304   coding->require_flushing = 0;
2305   coding->last_block = 0;
2306   coding->selective = 0;
2307   coding->composing = 0;
2308   coding->direction = 0;
2309   coding->carryover_size = 0;
2310   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2311   coding->character_unification_table_for_decode = Qnil;
2312   coding->character_unification_table_for_encode = Qnil;
2313
2314   Vlast_coding_system_used = coding->symbol = coding_system;
2315   eol_type = Qnil;
2316   /* Get value of property `coding-system' until we get a vector.
2317      While doing that, also get values of properties
2318      `post-read-conversion', `pre-write-conversion',
2319      `character-unification-table-for-decode',
2320      `character-unification-table-for-encode' and `eol-type'.  */
2321   while (!NILP (coding_system) && SYMBOLP (coding_system))
2322     {
2323       if (NILP (coding->post_read_conversion))
2324         coding->post_read_conversion = Fget (coding_system,
2325                                              Qpost_read_conversion);
2326       if (NILP (coding->pre_write_conversion))
2327         coding->pre_write_conversion = Fget (coding_system,
2328                                              Qpre_write_conversion);
2329       if (!inhibit_eol_conversion && NILP (eol_type))
2330         eol_type = Fget (coding_system, Qeol_type);
2331
2332       if (NILP (coding->character_unification_table_for_decode))
2333         coding->character_unification_table_for_decode
2334           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2335
2336       if (NILP (coding->character_unification_table_for_encode))
2337         coding->character_unification_table_for_encode
2338           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2339
2340       coding_system = Fget (coding_system, Qcoding_system);
2341     }
2342
2343   while (!NILP (coding->character_unification_table_for_decode)
2344          && SYMBOLP (coding->character_unification_table_for_decode))
2345         coding->character_unification_table_for_decode
2346           = Fget (coding->character_unification_table_for_decode,
2347                   Qcharacter_unification_table_for_decode);
2348   if (!NILP (coding->character_unification_table_for_decode)
2349       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2350       coding->character_unification_table_for_decode = Qnil;
2351
2352   while (!NILP (coding->character_unification_table_for_encode)
2353          && SYMBOLP (coding->character_unification_table_for_encode))
2354         coding->character_unification_table_for_encode
2355           = Fget (coding->character_unification_table_for_encode,
2356                   Qcharacter_unification_table_for_encode);
2357   if (!NILP (coding->character_unification_table_for_encode)
2358       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2359       coding->character_unification_table_for_encode = Qnil;
2360
2361   if (!VECTORP (coding_system)
2362       || XVECTOR (coding_system)->size != 5)
2363     goto label_invalid_coding_system;
2364
2365   if (VECTORP (eol_type))
2366     coding->eol_type = CODING_EOL_UNDECIDED;
2367   else if (XFASTINT (eol_type) == 1)
2368     coding->eol_type = CODING_EOL_CRLF;
2369   else if (XFASTINT (eol_type) == 2)
2370     coding->eol_type = CODING_EOL_CR;
2371   else
2372     coding->eol_type = CODING_EOL_LF;
2373
2374   type = XVECTOR (coding_system)->contents[0];
2375   switch (XFASTINT (type))
2376     {
2377     case 0:
2378       coding->type = coding_type_emacs_mule;
2379       break;
2380
2381     case 1:
2382       coding->type = coding_type_sjis;
2383       break;
2384
2385     case 2:
2386       coding->type = coding_type_iso2022;
2387       {
2388         Lisp_Object val;
2389         Lisp_Object *flags;
2390         int i, charset, default_reg_bits = 0;
2391
2392         val = XVECTOR (coding_system)->contents[4];
2393
2394         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2395           goto label_invalid_coding_system;
2396
2397         flags = XVECTOR (val)->contents;
2398         coding->flags
2399           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2400              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2401              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2402              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2403              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2404              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2405              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2406              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2407              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2408              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2409              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2410              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2411              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2412              );
2413
2414         /* Invoke graphic register 0 to plane 0.  */
2415         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2416         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2417         CODING_SPEC_ISO_INVOCATION (coding, 1)
2418           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2419         /* Not single shifting at first.  */
2420         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2421         /* Beginning of buffer should also be regarded as bol. */
2422         CODING_SPEC_ISO_BOL (coding) = 1;
2423
2424         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2425            FLAGS[REG] can be one of below:
2426                 integer CHARSET: CHARSET occupies register I,
2427                 t: designate nothing to REG initially, but can be used
2428                   by any charsets,
2429                 list of integer, nil, or t: designate the first
2430                   element (if integer) to REG initially, the remaining
2431                   elements (if integer) is designated to REG on request,
2432                   if an element is t, REG can be used by any charset,
2433                 nil: REG is never used.  */
2434         for (charset = 0; charset <= MAX_CHARSET; charset++)
2435           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2436             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2437         bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2438         for (i = 0; i < 4; i++)
2439           {
2440             if (INTEGERP (flags[i])
2441                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2442                 || (charset = get_charset_id (flags[i])) >= 0)
2443               {
2444                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2445                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2446                 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2447               }
2448             else if (EQ (flags[i], Qt))
2449               {
2450                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2451                 default_reg_bits |= 1 << i;
2452               }
2453             else if (CONSP (flags[i]))
2454               {
2455                 Lisp_Object tail = flags[i];
2456
2457                 if (INTEGERP (XCONS (tail)->car)
2458                     && (charset = XINT (XCONS (tail)->car),
2459                         CHARSET_VALID_P (charset))
2460                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2461                   {
2462                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2463                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2464                     CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2465                   }
2466                 else
2467                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2468                 tail = XCONS (tail)->cdr;
2469                 while (CONSP (tail))
2470                   {
2471                     if (INTEGERP (XCONS (tail)->car)
2472                         && (charset = XINT (XCONS (tail)->car),
2473                             CHARSET_VALID_P (charset))
2474                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2475                       {
2476                         CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2477                           = i;
2478                         CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2479                           = 1;
2480                       }
2481                     else if (EQ (XCONS (tail)->car, Qt))
2482                       default_reg_bits |= 1 << i;
2483                     tail = XCONS (tail)->cdr;
2484                   }
2485               }
2486             else
2487               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2488
2489             CODING_SPEC_ISO_DESIGNATION (coding, i)
2490               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2491           }
2492
2493         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2494           {
2495             /* REG 1 can be used only by locking shift in 7-bit env.  */
2496             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2497               default_reg_bits &= ~2;
2498             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2499               /* Without any shifting, only REG 0 and 1 can be used.  */
2500               default_reg_bits &= 3;
2501           }
2502
2503         for (charset = 0; charset <= MAX_CHARSET; charset++)
2504           if (CHARSET_VALID_P (charset)
2505               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2506                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2507             {
2508               /* We have not yet decided where to designate CHARSET.  */
2509               int reg_bits = default_reg_bits;
2510
2511               if (CHARSET_CHARS (charset) == 96)
2512                 /* A charset of CHARS96 can't be designated to REG 0.  */
2513                 reg_bits &= ~1;
2514
2515               if (reg_bits)
2516                 /* There exist some default graphic register.  */
2517                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2518                   = (reg_bits & 1
2519                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2520               else
2521                 /* We anyway have to designate CHARSET to somewhere.  */
2522                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2523                   = (CHARSET_CHARS (charset) == 94
2524                      ? 0
2525                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2526                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2527                         ? 1
2528                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2529                            ? 2 : 0)));
2530             }
2531       }
2532       coding->require_flushing = 1;
2533       break;
2534
2535     case 3:
2536       coding->type = coding_type_big5;
2537       coding->flags
2538         = (NILP (XVECTOR (coding_system)->contents[4])
2539            ? CODING_FLAG_BIG5_HKU
2540            : CODING_FLAG_BIG5_ETEN);
2541       break;
2542
2543     case 4:
2544       coding->type = coding_type_ccl;
2545       {
2546         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2547         if (CONSP  (val)
2548             && VECTORP (XCONS (val)->car)
2549             && VECTORP (XCONS (val)->cdr))
2550           {
2551             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2552             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2553           }
2554         else
2555           goto label_invalid_coding_system;
2556       }
2557       coding->require_flushing = 1;
2558       break;
2559
2560     case 5:
2561       coding->type = coding_type_raw_text;
2562       break;
2563
2564     default:
2565       if (EQ (type, Qt))
2566         coding->type = coding_type_undecided;
2567       else
2568         coding->type = coding_type_no_conversion;
2569       break;
2570     }
2571   return 0;
2572
2573  label_invalid_coding_system:
2574   coding->type = coding_type_no_conversion;
2575   coding->eol_type = CODING_EOL_LF;
2576   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2577     = Qnil;
2578   return -1;
2579 }
2580
2581 /* Emacs has a mechanism to automatically detect a coding system if it
2582    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2583    it's impossible to distinguish some coding systems accurately
2584    because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
2586
2587    o coding-category-emacs-mule
2588
2589         The category for a coding system which has the same code range
2590         as Emacs' internal format.  Assigned the coding-system (Lisp
2591         symbol) `emacs-mule' by default.
2592
2593    o coding-category-sjis
2594
2595         The category for a coding system which has the same code range
2596         as SJIS.  Assigned the coding-system (Lisp
2597         symbol) `japanese-shift-jis' by default.
2598
2599    o coding-category-iso-7
2600
2601         The category for a coding system which has the same code range
2602         as ISO2022 of 7-bit environment.  This doesn't use any locking
2603         shift and single shift functions.  Assigned the coding-system
2604         (Lisp symbol) `iso-2022-7bit' by default.
2605
2606    o coding-category-iso-8-1
2607
2608         The category for a coding system which has the same code range
2609         as ISO2022 of 8-bit environment and graphic plane 1 used only
2610         for DIMENSION1 charset.  This doesn't use any locking shift
2611         and single shift functions.  Assigned the coding-system (Lisp
2612         symbol) `iso-latin-1' by default.
2613
2614    o coding-category-iso-8-2
2615
2616         The category for a coding system which has the same code range
2617         as ISO2022 of 8-bit environment and graphic plane 1 used only
2618         for DIMENSION2 charset.  This doesn't use any locking shift
2619         and single shift functions.  Assigned the coding-system (Lisp
2620         symbol) `japanese-iso-8bit' by default.
2621
2622    o coding-category-iso-7-else
2623
2624         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
2626         single shift functions.  Assigned the coding-system (Lisp
2627         symbol) `iso-2022-7bit-lock' by default.
2628
2629    o coding-category-iso-8-else
2630
2631         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
2633         single shift functions.  Assigned the coding-system (Lisp
2634         symbol) `iso-2022-8bit-ss2' by default.
2635
2636    o coding-category-big5
2637
2638         The category for a coding system which has the same code range
2639         as BIG5.  Assigned the coding-system (Lisp symbol)
2640         `cn-big5' by default.
2641
2642    o coding-category-binary
2643
2644         The category for a coding system not categorized in any of the
2645         above.  Assigned the coding-system (Lisp symbol)
2646         `no-conversion' by default.
2647
2648    Each of them is a Lisp symbol and the value is an actual
2649    `coding-system's (this is also a Lisp symbol) assigned by a user.
2650    What Emacs does actually is to detect a category of coding system.
2651    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2652    decide only one possible category, it selects a category of the
2653    highest priority.  Priorities of categories are also specified by a
2654    user in a Lisp variable `coding-category-list'.
2655
2656 */
2657
2658 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2659    If it detects possible coding systems, return an integer in which
2660    appropriate flag bits are set.  Flag bits are defined by macros
2661    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2662
2663 int
2664 detect_coding_mask (src, src_bytes)
2665      unsigned char *src;
2666      int src_bytes;
2667 {
2668   register unsigned char c;
2669   unsigned char *src_end = src + src_bytes;
2670   int mask;
2671
2672   /* At first, skip all ASCII characters and control characters except
2673      for three ISO2022 specific control characters.  */
2674  label_loop_detect_coding:
2675   while (src < src_end)
2676     {
2677       c = *src;
2678       if (c >= 0x80
2679           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2680         break;
2681       src++;
2682     }
2683
2684   if (src >= src_end)
2685     /* We found nothing other than ASCII.  There's nothing to do.  */
2686     return CODING_CATEGORY_MASK_ANY;
2687
2688   /* The text seems to be encoded in some multilingual coding system.
2689      Now, try to find in which coding system the text is encoded.  */
2690   if (c < 0x80)
2691     {
2692       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2693       /* C is an ISO2022 specific control code of C0.  */
2694       mask = detect_coding_iso2022 (src, src_end);
2695       src++;
2696       if (mask == 0)
2697         /* No valid ISO2022 code follows C.  Try again.  */
2698         goto label_loop_detect_coding;
2699       mask |= CODING_CATEGORY_MASK_RAW_TEXT;
2700     }
2701   else if (c < 0xA0)
2702     {
2703       /* If C is a special latin extra code,
2704          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2705          or is an ISO2022 control-sequence-introducer (CSI),
2706          we should also consider the possibility of ISO2022 codings.  */
2707       if ((VECTORP (Vlatin_extra_code_table)
2708            && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2709           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2710           || (c == ISO_CODE_CSI
2711               && (src < src_end
2712                   && (*src == ']'
2713                       || (src + 1 < src_end
2714                           && src[1] == ']'
2715                           && (*src == '0' || *src == '1' || *src == '2'))))))
2716         mask = (detect_coding_iso2022 (src, src_end)
2717                 | detect_coding_sjis (src, src_end)
2718                 | detect_coding_emacs_mule (src, src_end)
2719                 | CODING_CATEGORY_MASK_RAW_TEXT);
2720
2721       else
2722         /* C is the first byte of SJIS character code,
2723            or a leading-code of Emacs' internal format (emacs-mule).  */
2724         mask = (detect_coding_sjis (src, src_end)
2725                 | detect_coding_emacs_mule (src, src_end)
2726                 | CODING_CATEGORY_MASK_RAW_TEXT);
2727     }
2728   else
2729     /* C is a character of ISO2022 in graphic plane right,
2730        or a SJIS's 1-byte character code (i.e. JISX0201),
2731        or the first byte of BIG5's 2-byte code.  */
2732     mask = (detect_coding_iso2022 (src, src_end)
2733             | detect_coding_sjis (src, src_end)
2734             | detect_coding_big5 (src, src_end)
2735             | CODING_CATEGORY_MASK_RAW_TEXT);
2736
2737   return mask;
2738 }
2739
2740 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2741    The information of the detected coding system is set in CODING.  */
2742
2743 void
2744 detect_coding (coding, src, src_bytes)
2745      struct coding_system *coding;
2746      unsigned char *src;
2747      int src_bytes;
2748 {
2749   int mask = detect_coding_mask (src, src_bytes);
2750   int idx;
2751   Lisp_Object val = Vcoding_category_list;
2752
2753   if (mask == CODING_CATEGORY_MASK_ANY)
2754     /* We found nothing other than ASCII.  There's nothing to do.  */
2755     return;
2756
2757   /* We found some plausible coding systems.  Let's use a coding
2758      system of the highest priority.  */
2759
2760   if (CONSP (val))
2761     while (!NILP (val))
2762       {
2763         idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2764         if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2765           break;
2766         val = XCONS (val)->cdr;
2767       }
2768   else
2769     val = Qnil;
2770
2771   if (NILP (val))
2772     {
2773       /* For unknown reason, `Vcoding_category_list' contains none of
2774          found categories.  Let's use any of them.  */
2775       for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2776         if (mask & (1 << idx))
2777           break;
2778     }
2779   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2780 }
2781
2782 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2783    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2784    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2785
2786 #define MAX_EOL_CHECK_COUNT 3
2787
2788 int
2789 detect_eol_type (src, src_bytes)
2790      unsigned char *src;
2791      int src_bytes;
2792 {
2793   unsigned char *src_end = src + src_bytes;
2794   unsigned char c;
2795   int total = 0;                /* How many end-of-lines are found so far.  */
2796   int eol_type = CODING_EOL_UNDECIDED;
2797   int this_eol_type;
2798
2799   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2800     {
2801       c = *src++;
2802       if (c == '\n' || c == '\r')
2803         {
2804           total++;
2805           if (c == '\n')
2806             this_eol_type = CODING_EOL_LF;
2807           else if (src >= src_end || *src != '\n')
2808             this_eol_type = CODING_EOL_CR;
2809           else
2810             this_eol_type = CODING_EOL_CRLF, src++;
2811
2812           if (eol_type == CODING_EOL_UNDECIDED)
2813             /* This is the first end-of-line.  */
2814             eol_type = this_eol_type;
2815           else if (eol_type != this_eol_type)
2816             /* The found type is different from what found before.
2817                Let's notice the caller about this inconsistency.  */
2818             return CODING_EOL_INCONSISTENT;
2819         }
2820     }
2821
2822   return eol_type;
2823 }
2824
2825 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2826    is encoded.  If it detects an appropriate format of end-of-line, it
2827    sets the information in *CODING.  */
2828
2829 void
2830 detect_eol (coding, src, src_bytes)
2831      struct coding_system *coding;
2832      unsigned char *src;
2833      int src_bytes;
2834 {
2835   Lisp_Object val, coding_system;
2836   int eol_type = detect_eol_type (src, src_bytes);
2837
2838   if (eol_type == CODING_EOL_UNDECIDED)
2839     /*  We found no end-of-line in the source text.  */
2840     return;
2841
2842   if (eol_type == CODING_EOL_INCONSISTENT)
2843     {
2844 #if 0
2845       /* This code is suppressed until we find a better way to
2846          distinguish raw text file and binary file.  */
2847
2848       /* If we have already detected that the coding is raw-text, the
2849          coding should actually be no-conversion.  */
2850       if (coding->type == coding_type_raw_text)
2851         {
2852           setup_coding_system (Qno_conversion, coding);
2853           return;
2854         }
2855       /* Else, let's decode only text code anyway.  */
2856 #endif /* 0 */
2857       eol_type = CODING_EOL_LF;
2858     }
2859
2860   coding_system = coding->symbol;
2861   while (!NILP (coding_system)
2862          && NILP (val = Fget (coding_system, Qeol_type)))
2863     coding_system = Fget (coding_system, Qcoding_system);
2864   if (VECTORP (val) && XVECTOR (val)->size == 3)
2865     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2866 }
2867
2868 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2869    decoding, it may detect coding system and format of end-of-line if
2870    those are not yet decided.  */
2871
2872 int
2873 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2874      struct coding_system *coding;
2875      unsigned char *source, *destination;
2876      int src_bytes, dst_bytes;
2877      int *consumed;
2878 {
2879   int produced;
2880
2881   if (src_bytes <= 0)
2882     {
2883       *consumed = 0;
2884       return 0;
2885     }
2886
2887   if (coding->type == coding_type_undecided)
2888     detect_coding (coding, source, src_bytes);
2889
2890   if (coding->eol_type == CODING_EOL_UNDECIDED)
2891     detect_eol (coding, source, src_bytes);
2892
2893   coding->carryover_size = 0;
2894   switch (coding->type)
2895     {
2896     case coding_type_no_conversion:
2897     label_no_conversion:
2898       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2899       bcopy (source, destination, produced);
2900       *consumed = produced;
2901       break;
2902
2903     case coding_type_emacs_mule:
2904     case coding_type_undecided:
2905     case coding_type_raw_text:
2906       if (coding->eol_type == CODING_EOL_LF
2907           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2908         goto label_no_conversion;
2909       produced = decode_eol (coding, source, destination,
2910                              src_bytes, dst_bytes, consumed);
2911       break;
2912
2913     case coding_type_sjis:
2914       produced = decode_coding_sjis_big5 (coding, source, destination,
2915                                           src_bytes, dst_bytes, consumed,
2916                                           1);
2917       break;
2918
2919     case coding_type_iso2022:
2920       produced = decode_coding_iso2022 (coding, source, destination,
2921                                         src_bytes, dst_bytes, consumed);
2922       break;
2923
2924     case coding_type_big5:
2925       produced = decode_coding_sjis_big5 (coding, source, destination,
2926                                           src_bytes, dst_bytes, consumed,
2927                                           0);
2928       break;
2929
2930     case coding_type_ccl:
2931       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2932                              src_bytes, dst_bytes, consumed);
2933       break;
2934     }
2935
2936   return produced;
2937 }
2938
2939 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2940
2941 int
2942 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2943      struct coding_system *coding;
2944      unsigned char *source, *destination;
2945      int src_bytes, dst_bytes;
2946      int *consumed;
2947 {
2948   int produced;
2949
2950   switch (coding->type)
2951     {
2952     case coding_type_no_conversion:
2953     label_no_conversion:
2954       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2955       if (produced > 0)
2956         {
2957           bcopy (source, destination, produced);
2958           if (coding->selective)
2959             {
2960               unsigned char *p = destination, *pend = destination + produced;
2961               while (p < pend)
2962                 if (*p++ == '\015') p[-1] = '\n';
2963             }
2964         }
2965       *consumed = produced;
2966       break;
2967
2968     case coding_type_emacs_mule:
2969     case coding_type_undecided:
2970     case coding_type_raw_text:
2971       if (coding->eol_type == CODING_EOL_LF
2972           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2973         goto label_no_conversion;
2974       produced = encode_eol (coding, source, destination,
2975                              src_bytes, dst_bytes, consumed);
2976       break;
2977
2978     case coding_type_sjis:
2979       produced = encode_coding_sjis_big5 (coding, source, destination,
2980                                           src_bytes, dst_bytes, consumed,
2981                                           1);
2982       break;
2983
2984     case coding_type_iso2022:
2985       produced = encode_coding_iso2022 (coding, source, destination,
2986                                         src_bytes, dst_bytes, consumed);
2987       break;
2988
2989     case coding_type_big5:
2990       produced = encode_coding_sjis_big5 (coding, source, destination,
2991                                           src_bytes, dst_bytes, consumed,
2992                                           0);
2993       break;
2994
2995     case coding_type_ccl:
2996       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2997                              src_bytes, dst_bytes, consumed);
2998       break;
2999     }
3000
3001   return produced;
3002 }
3003
3004 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3005
3006 /* Return maximum size (bytes) of a buffer enough for decoding
3007    SRC_BYTES of text encoded in CODING.  */
3008
3009 int
3010 decoding_buffer_size (coding, src_bytes)
3011      struct coding_system *coding;
3012      int src_bytes;
3013 {
3014   int magnification;
3015
3016   if (coding->type == coding_type_iso2022)
3017     magnification = 3;
3018   else if (coding->type == coding_type_ccl)
3019     magnification = coding->spec.ccl.decoder.buf_magnification;
3020   else
3021     magnification = 2;
3022
3023   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3024 }
3025
3026 /* Return maximum size (bytes) of a buffer enough for encoding
3027    SRC_BYTES of text to CODING.  */
3028
3029 int
3030 encoding_buffer_size (coding, src_bytes)
3031      struct coding_system *coding;
3032      int src_bytes;
3033 {
3034   int magnification;
3035
3036   if (coding->type == coding_type_ccl)
3037     magnification = coding->spec.ccl.encoder.buf_magnification;
3038   else
3039     magnification = 3;
3040
3041   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3042 }
3043
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Shared work buffer used for encoding and decoding, and its current
   size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared conversion buffer is reused when
   it is already large enough; otherwise it is replaced by a larger
   one whose size is found by repeated doubling.  The previous
   contents are NOT preserved when the buffer is replaced.

   NOTE(review): the result of xmalloc is not checked here; presumably
   xmalloc reports memory exhaustion itself instead of returning NULL
   -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never reaches SIZE, so make sure the doubling
         loop below starts from a positive value even if the buffer
         has not been set up yet.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      /* Release the old buffer only after the new one exists.  */
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3072
3073 \f
3074 #ifdef emacs
3075 /*** 7. Emacs Lisp library functions ***/
3076
3077 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3078        1, 1, 0,
3079   "Return coding-spec of CODING-SYSTEM.\n\
3080 If CODING-SYSTEM is not a valid coding-system, return nil.")
3081   (obj)
3082      Lisp_Object obj;
3083 {
3084   while (SYMBOLP (obj) && !NILP (obj))
3085     obj = Fget (obj, Qcoding_system);
3086   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3087           ? Qnil : obj);
3088 }
3089
3090 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3091   "Return t if OBJECT is nil or a coding-system.\n\
3092 See document of make-coding-system for coding-system object.")
3093   (obj)
3094      Lisp_Object obj;
3095 {
3096   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3097 }
3098
3099 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3100        Sread_non_nil_coding_system, 1, 1, 0,
3101   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3102   (prompt)
3103      Lisp_Object prompt;
3104 {
3105   Lisp_Object val;
3106   do
3107     {
3108       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3109                               Qt, Qnil, Qnil, Qnil, Qnil);
3110     }
3111   while (XSTRING (val)->size == 0);
3112   return (Fintern (val, Qnil));
3113 }
3114
3115 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3116   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3117   (prompt)
3118      Lisp_Object prompt;
3119 {
3120   Lisp_Object val;
3121   val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3122                           Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
3123   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3124 }
3125
3126 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3127        1, 1, 0,
3128   "Check validity of CODING-SYSTEM.\n\
3129 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3130 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3131 The value of property should be a vector of length 5.")
3132   (coding_system)
3133      Lisp_Object coding_system;
3134 {
3135   CHECK_SYMBOL (coding_system, 0);
3136   if (!NILP (Fcoding_system_p (coding_system)))
3137     return coding_system;
3138   while (1)
3139     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3140 }
3141
3142 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3143        2, 2, 0,
3144   "Detect coding system of the text in the region between START and END.\n\
3145 Return a list of possible coding systems ordered by priority.\n\
3146 If only ASCII characters are found, it returns `undecided'\n\
3147  or its subsidiary coding system according to a detected end-of-line format.")
3148   (b, e)
3149      Lisp_Object b, e;
3150 {
3151   int coding_mask, eol_type;
3152   Lisp_Object val;
3153   int beg, end;
3154
3155   validate_region (&b, &e);
3156   beg = XINT (b), end = XINT (e);
3157   if (beg < GPT && end >= GPT) move_gap (end);
3158
3159   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3160   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3161
3162   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3163     {
3164       val = Qundecided;
3165       if (eol_type != CODING_EOL_UNDECIDED
3166           && eol_type != CODING_EOL_INCONSISTENT)
3167         {
3168           Lisp_Object val2;
3169           val2 = Fget (Qundecided, Qeol_type);
3170           if (VECTORP (val2))
3171             val = XVECTOR (val2)->contents[eol_type];
3172         }
3173     }
3174   else
3175     {
3176       Lisp_Object val2;
3177
3178       /* At first, gather possible coding-systems in VAL in a reverse
3179          order.  */
3180       val = Qnil;
3181       for (val2 = Vcoding_category_list;
3182            !NILP (val2);
3183            val2 = XCONS (val2)->cdr)
3184         {
3185           int idx
3186             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3187           if (coding_mask & (1 << idx))
3188             {
3189 #if 0
3190               /* This code is suppressed until we find a better way to
3191                  distinguish raw text file and binary file.  */
3192
3193               if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3194                   && eol_type == CODING_EOL_INCONSISTENT)
3195                 val = Fcons (Qno_conversion, val);
3196               else
3197 #endif /* 0 */
3198                 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3199             }
3200         }
3201
3202       /* Then, change the order of the list, while getting subsidiary
3203          coding-systems.  */
3204       val2 = val;
3205       val = Qnil;
3206       if (eol_type == CODING_EOL_INCONSISTENT)
3207         eol_type == CODING_EOL_UNDECIDED;
3208       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3209         {
3210           if (eol_type == CODING_EOL_UNDECIDED)
3211             val = Fcons (XCONS (val2)->car, val);
3212           else
3213             {
3214               Lisp_Object val3;
3215               val3 = Fget (XCONS (val2)->car, Qeol_type);
3216               if (VECTORP (val3))
3217                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3218               else
3219                 val = Fcons (XCONS (val2)->car, val);
3220             }
3221         }
3222     }
3223
3224   return val;
3225 }
3226
3227 /* Scan text in the region between *BEGP and *ENDP, skip characters
3228    which we never have to encode to (iff ENCODEP is 1) or decode from
3229    coding system CODING at the head and tail, then set BEGP and ENDP
3230    to the addresses of start and end of the text we actually convert.  */
3231
3232 void
3233 shrink_conversion_area (begp, endp, coding, encodep)
3234      unsigned char **begp, **endp;
3235      struct coding_system *coding;
3236      int encodep;
3237 {
3238   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3239
3240   if (coding->eol_type != CODING_EOL_LF
3241       && coding->eol_type != CODING_EOL_UNDECIDED)
3242     /* Since we anyway have to convert end-of-line format, it is not
3243        worth skipping at most 100 bytes or so.  */
3244     return;
3245
3246   if (encodep)                  /* for encoding */
3247     {
3248       switch (coding->type)
3249         {
3250         case coding_type_no_conversion:
3251         case coding_type_emacs_mule:
3252         case coding_type_undecided:
3253         case coding_type_raw_text:
3254           /* We need no conversion.  */
3255           *begp = *endp;
3256           return;
3257         case coding_type_ccl:
3258           /* We can't skip any data.  */
3259           return;
3260         case coding_type_iso2022:
3261           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3262             {
3263               unsigned char *bol = beg_addr;
3264               while (beg_addr < end_addr && *beg_addr < 0x80)
3265                 {
3266                   beg_addr++;
3267                   if (*(beg_addr - 1) == '\n')
3268                     bol = beg_addr;
3269                 }
3270               beg_addr = bol;
3271               goto label_skip_tail;
3272             }
3273           /* fall down ... */
3274         default:
3275           /* We can skip all ASCII characters at the head and tail.  */
3276           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3277         label_skip_tail:
3278           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3279           break;
3280         }
3281     }
3282   else                          /* for decoding */
3283     {
3284       switch (coding->type)
3285         {
3286         case coding_type_no_conversion:
3287           /* We need no conversion.  */
3288           *begp = *endp;
3289           return;
3290         case coding_type_emacs_mule:
3291         case coding_type_raw_text:
3292           if (coding->eol_type == CODING_EOL_LF)
3293             {
3294               /* We need no conversion.  */
3295               *begp = *endp;
3296               return;
3297             }
3298           /* We can skip all but carriage-return.  */
3299           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3300           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3301           break;
3302         case coding_type_sjis:
3303         case coding_type_big5:
3304           /* We can skip all ASCII characters at the head.  */
3305           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3306           /* We can skip all ASCII characters at the tail except for
3307              the second byte of SJIS or BIG5 code.  */
3308           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3309           if (end_addr != *endp)
3310             end_addr++;
3311           break;
3312         case coding_type_ccl:
3313           /* We can't skip any data.  */
3314           return;
3315         default:                /* i.e. case coding_type_iso2022: */
3316           {
3317             unsigned char c;
3318
3319             /* We can skip all ASCII characters except for a few
3320                control codes at the head.  */
3321             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3322                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3323                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3324               beg_addr++;
3325           }
3326           break;
3327         }
3328     }
3329   *begp = beg_addr;
3330   *endp = end_addr;
3331   return;
3332 }
3333
3334 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3335    text between B and E.  B and E are buffer position.  */
3336
3337 Lisp_Object
3338 code_convert_region (b, e, coding, encodep)
3339      Lisp_Object b, e;
3340      struct coding_system *coding;
3341      int encodep;
3342 {
3343   int beg, end, len, consumed, produced;
3344   char *buf;
3345   unsigned char *begp, *endp;
3346   int pos = PT;
3347
3348   validate_region (&b, &e);
3349   beg = XINT (b), end = XINT (e);
3350   if (beg < GPT && end >= GPT)
3351     move_gap (end);
3352
3353   if (encodep && !NILP (coding->pre_write_conversion))
3354     {
3355       /* We must call a pre-conversion function which may put a new
3356          text to be converted in a new buffer.  */
3357       struct buffer *old = current_buffer, *new;
3358
3359       TEMP_SET_PT (beg);
3360       call2 (coding->pre_write_conversion, b, e);
3361       if (old != current_buffer)
3362         {
3363           /* Replace the original text by the text just generated.  */
3364           len = ZV - BEGV;
3365           new = current_buffer;
3366           set_buffer_internal (old);
3367           del_range (beg, end);
3368           insert_from_buffer (new, 1, len, 0);
3369           end = beg + len;
3370         }
3371     }
3372
3373   /* We may be able to shrink the conversion region.  */
3374   begp = POS_ADDR (beg); endp = begp + (end - beg);
3375   shrink_conversion_area (&begp, &endp, coding, encodep);
3376
3377   if (begp == endp)
3378     /* We need no conversion.  */
3379     len = end - beg;
3380   else
3381     {
3382       beg += begp - POS_ADDR (beg);
3383       end =  beg + (endp - begp);
3384
3385       if (encodep)
3386         len = encoding_buffer_size (coding, end - beg);
3387       else
3388         len = decoding_buffer_size (coding, end - beg);
3389       buf = get_conversion_buffer (len);
3390
3391       coding->last_block = 1;
3392       produced = (encodep
3393                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3394                                    &consumed)
3395                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3396                                    &consumed));
3397
3398       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3399
3400       TEMP_SET_PT (beg);
3401       insert (buf, produced);
3402       del_range (PT, PT + end - beg);
3403       if (pos >= end)
3404         pos = PT + (pos - end);
3405       else if (pos > beg)
3406         pos = beg;
3407       TEMP_SET_PT (pos);
3408   }
3409
3410   if (!encodep && !NILP (coding->post_read_conversion))
3411     {
3412       /* We must call a post-conversion function which may alter
3413          the text just converted.  */
3414       Lisp_Object insval;
3415
3416       beg = XINT (b);
3417       TEMP_SET_PT (beg);
3418       insval = call1 (coding->post_read_conversion, make_number (len));
3419       CHECK_NUMBER (insval, 0);
3420       len = XINT (insval);
3421     }
3422
3423   return make_number (len);
3424 }
3425
3426 Lisp_Object
3427 code_convert_string (str, coding, encodep, nocopy)
3428      Lisp_Object str, nocopy;
3429      struct coding_system *coding;
3430      int encodep;
3431 {
3432   int len, consumed, produced;
3433   char *buf;
3434   unsigned char *begp, *endp;
3435   int head_skip, tail_skip;
3436   struct gcpro gcpro1;
3437
3438   if (encodep && !NILP (coding->pre_write_conversion)
3439       || !encodep && !NILP (coding->post_read_conversion))
3440     {
3441       /* Since we have to call Lisp functions which assume target text
3442          is in a buffer, after setting a temporary buffer, call
3443          code_convert_region.  */
3444       int count = specpdl_ptr - specpdl;
3445       int len = XSTRING (str)->size;
3446       Lisp_Object result;
3447       struct buffer *old = current_buffer;
3448
3449       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3450       temp_output_buffer_setup (" *code-converting-work*");
3451       set_buffer_internal (XBUFFER (Vstandard_output));
3452       insert_from_string (str, 0, len, 0);
3453       code_convert_region (make_number (BEGV), make_number (ZV),
3454                            coding, encodep);
3455       result = make_buffer_string (BEGV, ZV, 0);
3456       set_buffer_internal (old);
3457       return unbind_to (count, result);
3458     }
3459
3460   /* We may be able to shrink the conversion region.  */
3461   begp = XSTRING (str)->data;
3462   endp = begp + XSTRING (str)->size;
3463   shrink_conversion_area (&begp, &endp, coding, encodep);
3464
3465   if (begp == endp)
3466     /* We need no conversion.  */
3467     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3468
3469   head_skip = begp - XSTRING (str)->data;
3470   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3471
3472   GCPRO1 (str);
3473
3474   if (encodep)
3475     len = encoding_buffer_size (coding, endp - begp);
3476   else
3477     len = decoding_buffer_size (coding, endp - begp);
3478   buf = get_conversion_buffer (len + head_skip + tail_skip);
3479
3480   bcopy (XSTRING (str)->data, buf, head_skip);
3481   coding->last_block = 1;
3482   produced = (encodep
3483               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3484                                buf + head_skip, endp - begp, len, &consumed)
3485               : decode_coding (coding, XSTRING (str)->data + head_skip,
3486                                buf + head_skip, endp - begp, len, &consumed));
3487   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3488          buf + head_skip + produced,
3489          tail_skip);
3490
3491   UNGCPRO;
3492
3493   return make_string (buf, head_skip + produced + tail_skip);
3494 }
3495
3496 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3497        3, 3, "r\nzCoding system: ",
3498   "Decode current region by specified coding system.\n\
3499 When called from a program, takes three arguments:\n\
3500 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3501 Return length of decoded text.")
3502   (b, e, coding_system)
3503      Lisp_Object b, e, coding_system;
3504 {
3505   struct coding_system coding;
3506
3507   CHECK_NUMBER_COERCE_MARKER (b, 0);
3508   CHECK_NUMBER_COERCE_MARKER (e, 1);
3509   CHECK_SYMBOL (coding_system, 2);
3510
3511   if (NILP (coding_system))
3512     return make_number (XFASTINT (e) - XFASTINT (b));
3513   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3514     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3515
3516   return code_convert_region (b, e, &coding, 0);
3517 }
3518
3519 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3520        3, 3, "r\nzCoding system: ",
3521   "Encode current region by specified coding system.\n\
3522 When called from a program, takes three arguments:\n\
3523 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3524 Return length of encoded text.")
3525   (b, e, coding_system)
3526      Lisp_Object b, e, coding_system;
3527 {
3528   struct coding_system coding;
3529
3530   CHECK_NUMBER_COERCE_MARKER (b, 0);
3531   CHECK_NUMBER_COERCE_MARKER (e, 1);
3532   CHECK_SYMBOL (coding_system, 2);
3533
3534   if (NILP (coding_system))
3535     return make_number (XFASTINT (e) - XFASTINT (b));
3536   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3537     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3538
3539   return code_convert_region (b, e, &coding, 1);
3540 }
3541
3542 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3543        2, 3, 0,
3544   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3545 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3546 of decoding.")
3547   (string, coding_system, nocopy)
3548      Lisp_Object string, coding_system, nocopy;
3549 {
3550   struct coding_system coding;
3551
3552   CHECK_STRING (string, 0);
3553   CHECK_SYMBOL (coding_system, 1);
3554
3555   if (NILP (coding_system))
3556     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3557   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3558     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3559
3560   return code_convert_string (string, &coding, 0, nocopy);
3561 }
3562
3563 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3564        2, 3, 0,
3565   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3566 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3567 of encoding.")
3568   (string, coding_system, nocopy)
3569      Lisp_Object string, coding_system, nocopy;
3570 {
3571   struct coding_system coding;
3572
3573   CHECK_STRING (string, 0);
3574   CHECK_SYMBOL (coding_system, 1);
3575
3576   if (NILP (coding_system))
3577     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3578   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3579     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3580
3581   return code_convert_string (string, &coding, 1, nocopy);
3582 }
3583
3584 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3585   "Decode a JISX0208 character of shift-jis encoding.\n\
3586 CODE is the character code in SJIS.\n\
3587 Return the corresponding character.")
3588   (code)
3589      Lisp_Object code;
3590 {
3591   unsigned char c1, c2, s1, s2;
3592   Lisp_Object val;
3593
3594   CHECK_NUMBER (code, 0);
3595   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3596   DECODE_SJIS (s1, s2, c1, c2);
3597   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3598   return val;
3599 }
3600
3601 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3602   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3603 Return the corresponding character code in SJIS.")
3604   (ch)
3605      Lisp_Object ch;
3606 {
3607   int charset, c1, c2, s1, s2;
3608   Lisp_Object val;
3609
3610   CHECK_NUMBER (ch, 0);
3611   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3612   if (charset == charset_jisx0208)
3613     {
3614       ENCODE_SJIS (c1, c2, s1, s2);
3615       XSETFASTINT (val, (s1 << 8) | s2);
3616     }
3617   else
3618     XSETFASTINT (val, 0);
3619   return val;
3620 }
3621
3622 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3623   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3624 CODE is the character code in BIG5.\n\
3625 Return the corresponding character.")
3626   (code)
3627      Lisp_Object code;
3628 {
3629   int charset;
3630   unsigned char b1, b2, c1, c2;
3631   Lisp_Object val;
3632
3633   CHECK_NUMBER (code, 0);
3634   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3635   DECODE_BIG5 (b1, b2, charset, c1, c2);
3636   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3637   return val;
3638 }
3639
3640 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3641   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3642 Return the corresponding character code in Big5.")
3643   (ch)
3644      Lisp_Object ch;
3645 {
3646   int charset, c1, c2, b1, b2;
3647   Lisp_Object val;
3648
3649   CHECK_NUMBER (ch, 0);
3650   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3651   if (charset == charset_big5_1 || charset == charset_big5_2)
3652     {
3653       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3654       XSETFASTINT (val, (b1 << 8) | b2);
3655     }
3656   else
3657     XSETFASTINT (val, 0);
3658   return val;
3659 }
3660
3661 DEFUN ("set-terminal-coding-system-internal",
3662        Fset_terminal_coding_system_internal,
3663        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3664   (coding_system)
3665      Lisp_Object coding_system;
3666 {
3667   CHECK_SYMBOL (coding_system, 0);
3668   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3669   /* We had better not send unexpected characters to terminal.  */
3670   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3671
3672   return Qnil;
3673 }
3674
3675 DEFUN ("set-safe-terminal-coding-system-internal",
3676        Fset_safe_terminal_coding_system_internal,
3677        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3678   (coding_system)
3679      Lisp_Object coding_system;
3680 {
3681   CHECK_SYMBOL (coding_system, 0);
3682   setup_coding_system (Fcheck_coding_system (coding_system),
3683                        &safe_terminal_coding);
3684   return Qnil;
3685 }
3686
3687 DEFUN ("terminal-coding-system",
3688        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3689   "Return coding-system of your terminal.")
3690   ()
3691 {
3692   return terminal_coding.symbol;
3693 }
3694
3695 DEFUN ("set-keyboard-coding-system-internal",
3696        Fset_keyboard_coding_system_internal,
3697        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3698   (coding_system)
3699      Lisp_Object coding_system;
3700 {
3701   CHECK_SYMBOL (coding_system, 0);
3702   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3703   return Qnil;
3704 }
3705
3706 DEFUN ("keyboard-coding-system",
3707        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3708   "Return coding-system of what is sent from terminal keyboard.")
3709   ()
3710 {
3711   return keyboard_coding.symbol;
3712 }
3713
3714 \f
3715 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3716        Sfind_operation_coding_system,  1, MANY, 0,
3717   "Choose a coding system for an operation based on the target name.\n\
3718 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3719 DECODING-SYSTEM is the coding system to use for decoding\n\
3720 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3721 for encoding (in case OPERATION does encoding).\n\
3722 \n\
3723 The first argument OPERATION specifies an I/O primitive:\n\
3724   For file I/O, `insert-file-contents' or `write-region'.\n\
3725   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3726   For network I/O, `open-network-stream'.\n\
3727 \n\
3728 The remaining arguments should be the same arguments that were passed\n\
3729 to the primitive.  Depending on which primitive, one of those arguments\n\
3730 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3731 whichever argument specifies the file name is TARGET.\n\
3732 \n\
3733 TARGET has a meaning which depends on OPERATION:\n\
3734   For file I/O, TARGET is a file name.\n\
3735   For process I/O, TARGET is a process name.\n\
3736   For network I/O, TARGET is a service name or a port number\n\
3737 \n\
3738 This function looks up what specified for TARGET in,\n\
3739 `file-coding-system-alist', `process-coding-system-alist',\n\
3740 or `network-coding-system-alist' depending on OPERATION.\n\
3741 They may specify a coding system, a cons of coding systems,\n\
3742 or a function symbol to call.\n\
3743 In the last case, we call the function with one argument,\n\
3744 which is a list of all the arguments given to this function.")
3745   (nargs, args)
3746      int nargs;
3747      Lisp_Object *args;
3748 {
3749   Lisp_Object operation, target_idx, target, val;
3750   register Lisp_Object chain;
3751
3752   if (nargs < 2)
3753     error ("Too few arguments");
3754   operation = args[0];
3755   if (!SYMBOLP (operation)
3756       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3757     error ("Invalid first arguement");
3758   if (nargs < 1 + XINT (target_idx))
3759     error ("Too few arguments for operation: %s",
3760            XSYMBOL (operation)->name->data);
3761   target = args[XINT (target_idx) + 1];
3762   if (!(STRINGP (target)
3763         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3764     error ("Invalid %dth argument", XINT (target_idx) + 1);
3765
3766   chain = ((EQ (operation, Qinsert_file_contents)
3767             || EQ (operation, Qwrite_region))
3768            ? Vfile_coding_system_alist
3769            : (EQ (operation, Qopen_network_stream)
3770               ? Vnetwork_coding_system_alist
3771               : Vprocess_coding_system_alist));
3772   if (NILP (chain))
3773     return Qnil;
3774
3775   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3776     {
3777       Lisp_Object elt;
3778       elt = XCONS (chain)->car;
3779
3780       if (CONSP (elt)
3781           && ((STRINGP (target)
3782                && STRINGP (XCONS (elt)->car)
3783                && fast_string_match (XCONS (elt)->car, target) >= 0)
3784               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3785         {
3786           val = XCONS (elt)->cdr;
3787           if (CONSP (val))
3788             return val;
3789           if (! SYMBOLP (val))
3790             return Qnil;
3791           if (! NILP (Fcoding_system_p (val)))
3792             return Fcons (val, val);
3793           if (!NILP (Ffboundp (val)))
3794             return call1 (val, Flist (nargs, args));
3795           return Qnil;
3796         }
3797     }
3798   return Qnil;
3799 }
3800
3801 #endif /* emacs */
3802
3803 \f
3804 /*** 8. Post-amble ***/
3805
3806 init_coding_once ()
3807 {
3808   int i;
3809
3810   /* Emacs' internal format specific initialize routine.  */
3811   for (i = 0; i <= 0x20; i++)
3812     emacs_code_class[i] = EMACS_control_code;
3813   emacs_code_class[0x0A] = EMACS_linefeed_code;
3814   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3815   for (i = 0x21 ; i < 0x7F; i++)
3816     emacs_code_class[i] = EMACS_ascii_code;
3817   emacs_code_class[0x7F] = EMACS_control_code;
3818   emacs_code_class[0x80] = EMACS_leading_code_composition;
3819   for (i = 0x81; i < 0xFF; i++)
3820     emacs_code_class[i] = EMACS_invalid_code;
3821   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3822   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3823   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3824   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3825
3826   /* ISO2022 specific initialize routine.  */
3827   for (i = 0; i < 0x20; i++)
3828     iso_code_class[i] = ISO_control_code;
3829   for (i = 0x21; i < 0x7F; i++)
3830     iso_code_class[i] = ISO_graphic_plane_0;
3831   for (i = 0x80; i < 0xA0; i++)
3832     iso_code_class[i] = ISO_control_code;
3833   for (i = 0xA1; i < 0xFF; i++)
3834     iso_code_class[i] = ISO_graphic_plane_1;
3835   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3836   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3837   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3838   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3839   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3840   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3841   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3842   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3843   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3844   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3845
3846   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3847   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3848
3849   setup_coding_system (Qnil, &keyboard_coding);
3850   setup_coding_system (Qnil, &terminal_coding);
3851   setup_coding_system (Qnil, &safe_terminal_coding);
3852
3853 #if defined (MSDOS) || defined (WINDOWSNT)
3854   system_eol_type = CODING_EOL_CRLF;
3855 #else
3856   system_eol_type = CODING_EOL_LF;
3857 #endif
3858 }
3859
3860 #ifdef emacs
3861
/* Set up the Lisp-visible part of this file: intern the symbols used
   here and register them with staticpro, attach the `target-idx',
   error and coding-category properties, register the subrs with
   defsubr, and declare the Lisp variables together with their
   initial values.  Takes no arguments.  */
3862 syms_of_coding ()
3863 {
3864   Qtarget_idx = intern ("target-idx");
3865   staticpro (&Qtarget_idx);
3866
3867   Qcoding_system_history = intern ("coding-system-history");
3868   staticpro (&Qcoding_system_history);
       /* The history variable starts out as nil.  */
3869   Fset (Qcoding_system_history, Qnil);
3870
       /* The `target-idx' property of an operation symbol holds the
          zero-based position of its TARGET argument.
          Qinsert_file_contents and Qwrite_region are used here but are
          not interned by this function.  */
3871   /* Target FILENAME is the first argument.  */
3872   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3873   /* Target FILENAME is the third argument.  */
3874   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3875
3876   Qcall_process = intern ("call-process");
3877   staticpro (&Qcall_process);
3878   /* Target PROGRAM is the first argument.  */
3879   Fput (Qcall_process, Qtarget_idx, make_number (0));
3880
3881   Qcall_process_region = intern ("call-process-region");
3882   staticpro (&Qcall_process_region);
3883   /* Target PROGRAM is the third argument.  */
3884   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3885
3886   Qstart_process = intern ("start-process");
3887   staticpro (&Qstart_process);
3888   /* Target PROGRAM is the third argument.  */
3889   Fput (Qstart_process, Qtarget_idx, make_number (2));
3890
3891   Qopen_network_stream = intern ("open-network-stream");
3892   staticpro (&Qopen_network_stream);
3893   /* Target SERVICE is the fourth argument.  */
3894   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3895
3896   Qcoding_system = intern ("coding-system");
3897   staticpro (&Qcoding_system);
3898
3899   Qeol_type = intern ("eol-type");
3900   staticpro (&Qeol_type);
3901
3902   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3903   staticpro (&Qbuffer_file_coding_system);
3904
3905   Qpost_read_conversion = intern ("post-read-conversion");
3906   staticpro (&Qpost_read_conversion);
3907
3908   Qpre_write_conversion = intern ("pre-write-conversion");
3909   staticpro (&Qpre_write_conversion);
3910
3911   Qno_conversion = intern ("no-conversion");
3912   staticpro (&Qno_conversion);
3913
3914   Qundecided = intern ("undecided");
3915   staticpro (&Qundecided);
3916
3917   Qcoding_system_spec = intern ("coding-system-spec");
3918   staticpro (&Qcoding_system_spec);
3919
3920   Qcoding_system_p = intern ("coding-system-p");
3921   staticpro (&Qcoding_system_p);
3922
3923   Qcoding_system_error = intern ("coding-system-error");
3924   staticpro (&Qcoding_system_error);
3925
       /* Give `coding-system-error' its Qerror_conditions list (itself
          followed by Qerror) and its Qerror_message string.  */
3926   Fput (Qcoding_system_error, Qerror_conditions,
3927         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3928   Fput (Qcoding_system_error, Qerror_message,
3929         build_string ("Invalid coding system"));
3930
3931   Qcoding_category_index = intern ("coding-category-index");
3932   staticpro (&Qcoding_category_index);
3933
       /* Intern one symbol per entry of coding_category_name, keep it in
          coding_category_table, and store the entry's index on the
          symbol as its `coding-category-index' property.  */
3934   {
3935     int i;
3936     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3937       {
3938         coding_category_table[i] = intern (coding_category_name[i]);
3939         staticpro (&coding_category_table[i]);
3940         Fput (coding_category_table[i], Qcoding_category_index,
3941               make_number (i));
3942       }
3943   }
3944
3945   Qcharacter_unification_table = intern ("character-unification-table");
3946   staticpro (&Qcharacter_unification_table);
       /* Set the Qchar_table_extra_slots property of the symbol to 0.  */
3947   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3948         make_number (0));
3949
3950   Qcharacter_unification_table_for_decode
3951     = intern ("character-unification-table-for-decode");
3952   staticpro (&Qcharacter_unification_table_for_decode);
3953
3954   Qcharacter_unification_table_for_encode
3955     = intern ("character-unification-table-for-encode");
3956   staticpro (&Qcharacter_unification_table_for_encode);
3957
3958   Qemacs_mule = intern ("emacs-mule");
3959   staticpro (&Qemacs_mule);
3960
       /* Register the subrs.  */
3961   defsubr (&Scoding_system_spec);
3962   defsubr (&Scoding_system_p);
3963   defsubr (&Sread_coding_system);
3964   defsubr (&Sread_non_nil_coding_system);
3965   defsubr (&Scheck_coding_system);
3966   defsubr (&Sdetect_coding_region);
3967   defsubr (&Sdecode_coding_region);
3968   defsubr (&Sencode_coding_region);
3969   defsubr (&Sdecode_coding_string);
3970   defsubr (&Sencode_coding_string);
3971   defsubr (&Sdecode_sjis_char);
3972   defsubr (&Sencode_sjis_char);
3973   defsubr (&Sdecode_big5_char);
3974   defsubr (&Sencode_big5_char);
3975   defsubr (&Sset_terminal_coding_system_internal);
3976   defsubr (&Sset_safe_terminal_coding_system_internal);
3977   defsubr (&Sterminal_coding_system);
3978   defsubr (&Sset_keyboard_coding_system_internal);
3979   defsubr (&Skeyboard_coding_system);
3980   defsubr (&Sfind_operation_coding_system);
3981
       /* Lisp variables and their initial values follow.  */
3982   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3983     "List of coding-categories (symbols) ordered by priority.");
       /* Consing from the highest index down to 0 leaves the initial
          list in ascending order of category index.  */
3984   {
3985     int i;
3986
3987     Vcoding_category_list = Qnil;
3988     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3989       Vcoding_category_list
3990         = Fcons (coding_category_table[i], Vcoding_category_list);
3991   }
3992
3993   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3994     "Specify the coding system for read operations.\n\
3995 It is useful to bind this variable with `let', but do not set it globally.\n\
3996 If the value is a coding system, it is used for decoding on read operation.\n\
3997 If not, an appropriate element is used from one of the coding system alists:\n\
3998 There are three such tables, `file-coding-system-alist',\n\
3999 `process-coding-system-alist', and `network-coding-system-alist'.");
4000   Vcoding_system_for_read = Qnil;
4001
4002   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
4003     "Specify the coding system for write operations.\n\
4004 It is useful to bind this variable with `let', but do not set it globally.\n\
4005 If the value is a coding system, it is used for encoding on write operation.\n\
4006 If not, an appropriate element is used from one of the coding system alists:\n\
4007 There are three such tables, `file-coding-system-alist',\n\
4008 `process-coding-system-alist', and `network-coding-system-alist'.");
4009   Vcoding_system_for_write = Qnil;
4010
4011   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4012     "Coding system used in the latest file or process I/O.");
4013   Vlast_coding_system_used = Qnil;
4014
4015   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4016     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4017   inhibit_eol_conversion = 0;
4018
       /* The three operation alists all start out empty.  */
4019   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4020     "Alist to decide a coding system to use for a file I/O operation.\n\
4021 The format is ((PATTERN . VAL) ...),\n\
4022 where PATTERN is a regular expression matching a file name,\n\
4023 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4024 If VAL is a coding system, it is used for both decoding and encoding\n\
4025 the file contents.\n\
4026 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4027 and the cdr part is used for encoding.\n\
4028 If VAL is a function symbol, the function must return a coding system\n\
4029 or a cons of coding systems which are used as above.\n\
4030 \n\
4031 See also the function `find-operation-coding-system'.");
4032   Vfile_coding_system_alist = Qnil;
4033
4034   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4035     "Alist to decide a coding system to use for a process I/O operation.\n\
4036 The format is ((PATTERN . VAL) ...),\n\
4037 where PATTERN is a regular expression matching a program name,\n\
4038 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4039 If VAL is a coding system, it is used for both decoding what received\n\
4040 from the program and encoding what sent to the program.\n\
4041 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4042 and the cdr part is used for encoding.\n\
4043 If VAL is a function symbol, the function must return a coding system\n\
4044 or a cons of coding systems which are used as above.\n\
4045 \n\
4046 See also the function `find-operation-coding-system'.");
4047   Vprocess_coding_system_alist = Qnil;
4048
4049   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4050     "Alist to decide a coding system to use for a network I/O operation.\n\
4051 The format is ((PATTERN . VAL) ...),\n\
4052 where PATTERN is a regular expression matching a network service name\n\
4053 or is a port number to connect to,\n\
4054 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4055 If VAL is a coding system, it is used for both decoding what received\n\
4056 from the network stream and encoding what sent to the network stream.\n\
4057 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4058 and the cdr part is used for encoding.\n\
4059 If VAL is a function symbol, the function must return a coding system\n\
4060 or a cons of coding systems which are used as above.\n\
4061 \n\
4062 See also the function `find-operation-coding-system'.");
4063   Vnetwork_coding_system_alist = Qnil;
4064
4065   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4066     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4067   eol_mnemonic_unix = ':';
4068
4069   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4070     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4071   eol_mnemonic_dos = '\\';
4072
4073   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4074     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4075   eol_mnemonic_mac = '/';
4076
4077   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4078     "Mnemonic character indicating end-of-line format is not yet decided.");
       /* Same initial character as eol_mnemonic_unix above.  */
4079   eol_mnemonic_undecided = ':';
4080
4081   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4082     "Non-nil means ISO 2022 encoder/decoder do character unification.");
4083   Venable_character_unification = Qt;
4084
4085   DEFVAR_LISP ("standard-character-unification-table-for-decode",
4086     &Vstandard_character_unification_table_for_decode,
4087     "Table for unifying characters when reading.");
4088   Vstandard_character_unification_table_for_decode = Qnil;
4089
4090   DEFVAR_LISP ("standard-character-unification-table-for-encode",
4091     &Vstandard_character_unification_table_for_encode,
4092     "Table for unifying characters when writing.");
4093   Vstandard_character_unification_table_for_encode = Qnil;
4094
       /* NOTE(review): the Lisp name ends in `-table' while the C
          variable name and the doc string both say alist -- confirm
          which name is intended.  */
4095   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4096     "Alist of charsets vs revision numbers.\n\
4097 While encoding, if a charset (car part of an element) is found,\n\
4098 designate it with the escape sequence identifing revision (cdr part of the element).");
4099   Vcharset_revision_alist = Qnil;
4100
4101   DEFVAR_LISP ("default-process-coding-system",
4102                &Vdefault_process_coding_system,
4103     "Cons of coding systems used for process I/O by default.\n\
4104 The car part is used for decoding a process output,\n\
4105 the cdr part is used for encoding a text to be sent to a process.");
4106   Vdefault_process_coding_system = Qnil;
4107
4108   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4109     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4110 This is a vector of length 256.\n\
4111 If Nth element is non-nil, the existence of code N in a file\n\
4112 \(or output of subprocess) doesn't prevent it to be detected as\n\
4113 a coding system of ISO 2022 variant which has a flag\n\
4114 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4115 or reading output of a subprocess.\n\
4116 Only 128th through 159th elements has a meaning.");
       /* Initial value: a vector made with length 256 and nil as the
          fill value.  */
4117   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4118 }
4119
4120 #endif /* emacs */