src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Raw text
  71
  72   A coding system for a text containing random 8-bit code.  Emacs does
  73   no code conversion on such a text except for end-of-line format.
  74
  75   5. Other
  76
  77   If a user wants to read/write a text encoded in a coding system not
  78   listed above, he can supply a decoder and an encoder for it in CCL
  79   (Code Conversion Language) programs.  Emacs executes the CCL program
  80   while reading/writing.
  81
  82   Emacs represents a coding-system by a Lisp symbol that has a property
  83   `coding-system'.  But, before actually using the coding-system, the
  84   information about it is set in a structure of type `struct
  85   coding_system' for rapid processing.  See section 6 for more details.
  86
  87 */
  88
  89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  90
  91   How end-of-line of a text is encoded depends on a system.  For
  92   instance, Unix's format is just one byte of `line-feed' code,
  93   whereas DOS's format is two-byte sequence of `carriage-return' and
  94   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  95
  96   Since text characters encoding and end-of-line encoding are
  97   independent, any coding system described above can take
  98   any format of end-of-line.  So, Emacs has information of format of
  99   end-of-line in each coding-system.  See section 6 for more details.
 100
 101 */
 102
 103 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 104
 105   These functions check if a text between SRC and SRC_END is encoded
 106   in the coding system category XXX.  Each returns an integer value in
 107   which appropriate flag bits for the category XXX are set.  The flag
 108   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 109   template of these functions.  */
 110 #if 0
 111 int
 112 detect_coding_emacs_mule (src, src_end)
 113      unsigned char *src, *src_end;
 114 {
 115   ...
 116 }
 117 #endif
 118
 119 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 120
 121   These functions decode SRC_BYTES length text at SOURCE encoded in
 122   CODING to Emacs' internal format (emacs-mule).  The resulting text
 123   goes to a place pointed to by DESTINATION, the length of which should
 124   not exceed DST_BYTES.  The number of bytes actually processed is
 125   returned as *CONSUMED.  The return value is the length of the decoded
 126   text.  Below is a template of these functions.  */
 127 #if 0
 128 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 129      struct coding_system *coding;
 130      unsigned char *source, *destination;
 131      int src_bytes, dst_bytes;
 132      int *consumed;
 133 {
 134   ...
 135 }
 136 #endif
 137
 138 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 139
 140   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 141   internal format (emacs-mule) to CODING.  The resulting text goes to
 142   a place pointed to by DESTINATION, the length of which should not
 143   exceed DST_BYTES.  The number of bytes actually processed is
 144   returned as *CONSUMED.  The return value is the length of the
 145   encoded text.  Below is a template of these functions.  */
 146 #if 0
 147 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 148      struct coding_system *coding;
 149      unsigned char *source, *destination;
 150      int src_bytes, dst_bytes;
 151      int *consumed;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** COMMONLY USED MACROS ***/
 158
 159 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 160    THREE_MORE_BYTES safely get one, two, and three bytes from the
 161    source text respectively.  If there are not enough bytes in the
 162    source, they jump to `label_end_of_loop'.  The caller should set
 163    variables `src' and `src_end' to appropriate areas in advance.  */
 164
 165 #define ONE_MORE_BYTE(c1)       \
 166   do {                          \
 167     if (src < src_end)          \
 168       c1 = *src++;              \
 169     else                        \
 170       goto label_end_of_loop;   \
 171   } while (0)
 172
 173 #define TWO_MORE_BYTES(c1, c2)  \
 174   do {                          \
 175     if (src + 1 < src_end)      \
 176       c1 = *src++, c2 = *src++; \
 177     else                        \
 178       goto label_end_of_loop;   \
 179   } while (0)
 180
 181 #define THREE_MORE_BYTES(c1, c2, c3)            \
 182   do {                                          \
 183     if (src + 2 < src_end)                      \
 184       c1 = *src++, c2 = *src++, c3 = *src++;    \
 185     else                                        \
 186       goto label_end_of_loop;                   \
 187   } while (0)
 188
 189 /* The following three macros DECODE_CHARACTER_ASCII,
 190    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 191    the multi-byte form of a character of each class at the place
 192    pointed by `dst'.  The caller should set the variable `dst' to
 193    point to an appropriate area and the variable `coding' to point to
 194    the coding-system of the currently decoding text in advance.  */
 195
 196 /* Decode one ASCII character C.  */
 197
 198 #define DECODE_CHARACTER_ASCII(c)                               \
 199   do {                                                          \
 200     if (COMPOSING_P (coding->composing))                        \
 201       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 202     else                                                        \
 203       *dst++ = (c);                                             \
 204   } while (0)
 205
 206 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 207    position-code is C.  */
 208
 209 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 210   do {                                                                  \
 211     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 212     if (COMPOSING_P (coding->composing))                                \
 213       *dst++ = leading_code + 0x20;                                     \
 214     else                                                                \
 215       *dst++ = leading_code;                                            \
 216     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 217       *dst++ = leading_code;                                            \
 218     *dst++ = (c) | 0x80;                                                \
 219   } while (0)
 220
 221 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 222    position-codes are C1 and C2.  */
 223
 224 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 225   do {                                                  \
 226     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 227     *dst++ = (c2) | 0x80;                               \
 228   } while (0)
 229
 230 \f
 231 /*** 1. Preamble ***/
 232
 233 #include <stdio.h>
 234
 235 #ifdef emacs
 236
 237 #include <config.h>
 238 #include "lisp.h"
 239 #include "buffer.h"
 240 #include "charset.h"
 241 #include "ccl.h"
 242 #include "coding.h"
 243 #include "window.h"
 244
 245 #else  /* not emacs */
 246
 247 #include "mulelib.h"
 248
 249 #endif /* not emacs */
 250
 251 Lisp_Object Qcoding_system, Qeol_type;
     /* NOTE(review): the `Q' variables in this group presumably hold
        Lisp symbols that are set up elsewhere in this file -- confirm
        against the initialization code.  */
 252 Lisp_Object Qbuffer_file_coding_system;
 253 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 254 Lisp_Object Qno_conversion, Qundecided;
 255 Lisp_Object Qcoding_system_history;
 256 Lisp_Object Qsafe_charsets;
 257
     /* These two are defined in another file; only declared here.  */
 258 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 259 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 260 Lisp_Object Qstart_process, Qopen_network_stream;
 261 Lisp_Object Qtarget_idx;
 262
 263 /* Mnemonic character of each format of end-of-line.  */
 264 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 265 /* Mnemonic character to indicate format of end-of-line is not yet
 266    decided.  */
 267 int eol_mnemonic_undecided;
 268
 269 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 270    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 271 int system_eol_type;
 272
 273 #ifdef emacs
 274
     /* Everything down to the matching #endif exists only when this file
        is compiled as part of Emacs (`emacs' defined).  */
 275 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 276
 277 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 278
 279 /* Coding system emacs-mule is for converting only end-of-line format.  */
 280 Lisp_Object Qemacs_mule;
 281
 282 /* Coding-systems are handed between Emacs Lisp programs and C internal
 283    routines by the following three variables.  */
 284 /* Coding-system for reading files and receiving data from process.  */
 285 Lisp_Object Vcoding_system_for_read;
 286 /* Coding-system for writing files and sending data to process.  */
 287 Lisp_Object Vcoding_system_for_write;
 288 /* Coding-system actually used in the latest I/O.  */
 289 Lisp_Object Vlast_coding_system_used;
 290
 291 /* A vector of length 256 which contains information about special
 292    Latin codes (especially for dealing with Microsoft code).
        detect_coding_iso2022 indexes it by byte value and treats a
        non-nil element as marking a latin extra code.  */
 293 Lisp_Object Vlatin_extra_code_table;
 294
 295 /* Flag to inhibit code conversion of end-of-line format.  */
 296 int inhibit_eol_conversion;
 297
 298 /* Coding system to be used to encode text for terminal display.  */
 299 struct coding_system terminal_coding;
 300
 301 /* Coding system to be used to encode text for terminal display when
 302    terminal coding system is nil.  */
 303 struct coding_system safe_terminal_coding;
 304
 305 /* Coding system of what is sent from terminal keyboard.  */
 306 struct coding_system keyboard_coding;
 307
     /* NOTE(review): presumably alists used to choose coding systems for
        file, process and network I/O respectively -- confirm against
        the code that consults them.  */
 308 Lisp_Object Vfile_coding_system_alist;
 309 Lisp_Object Vprocess_coding_system_alist;
 310 Lisp_Object Vnetwork_coding_system_alist;
 311
 312 #endif /* emacs */
 313
 314 Lisp_Object Qcoding_category_index;
 315
 316 /* List of symbols `coding-category-xxx' ordered by priority.  */
 317 Lisp_Object Vcoding_category_list;
 318
 319 /* Table of coding-systems currently assigned to each coding-category.
        It is indexed by CODING_CATEGORY_IDX_XXX; detect_coding_iso2022
        treats each element as a symbol and passes that symbol's value
        to setup_coding_system.  */
 320 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 321
 322 /* Table of names of symbol for each coding-category.
        NOTE(review): ten names are listed; presumably they are in
        CODING_CATEGORY_IDX_XXX order and CODING_CATEGORY_IDX_MAX is 10
        -- verify against coding.h.  */
 323 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 324   "coding-category-emacs-mule",
 325   "coding-category-sjis",
 326   "coding-category-iso-7",
 327   "coding-category-iso-8-1",
 328   "coding-category-iso-8-2",
 329   "coding-category-iso-7-else",
 330   "coding-category-iso-8-else",
 331   "coding-category-big5",
 332   "coding-category-raw-text",
 333   "coding-category-binary"
 334 };
 335
 336 /* Flag to tell if we look up unification table on character code
 337    conversion.  */
 338 Lisp_Object Venable_character_unification;
 339 /* Standard unification table to look up on decoding (reading).  */
 340 Lisp_Object Vstandard_character_unification_table_for_decode;
 341 /* Standard unification table to look up on encoding (writing).  */
 342 Lisp_Object Vstandard_character_unification_table_for_encode;
 343
     /* NOTE(review): presumably the symbols naming the unification-table
        properties -- confirm where they are initialized.  */
 344 Lisp_Object Qcharacter_unification_table;
 345 Lisp_Object Qcharacter_unification_table_for_decode;
 346 Lisp_Object Qcharacter_unification_table_for_encode;
 347
 348 /* Alist of charsets vs revision number.  */
 349 Lisp_Object Vcharset_revision_alist;
 350
 351 /* Default coding systems used for process I/O.  */
 352 Lisp_Object Vdefault_process_coding_system;
 353
 354 \f
 355 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 356
 357 /* Emacs' internal format for encoding multiple character sets is a
 358    kind of multi-byte encoding, i.e. characters are encoded by
 359    variable-length sequences of one-byte codes.  ASCII characters
 360    and control characters (e.g. `tab', `newline') are represented by
 361    one-byte sequences which are their ASCII codes, in the range 0x00
 362    through 0x7F.  The other characters are represented by a sequence
 363    of `base leading-code', optional `extended leading-code', and one
 364    or two `position-code's.  The length of the sequence is determined
 365    by the base leading-code.  Leading-code takes the range 0x80
 366    through 0x9F, whereas extended leading-code and position-code take
 367    the range 0xA0 through 0xFF.  See `charset.h' for more details
 368    about leading-code and position-code.
 369
 370    There's one exception to this rule.  Special leading-code
 371    `leading-code-composition' denotes that the following several
 372    characters should be composed into one character.  Leading-codes of
 373    components (except for ASCII) are added 0x20.  An ASCII character
 374    component is represented by a 2-byte sequence of `0xA0' and
 375    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 376    details of composite character.  Hence, we can summarize the code
 377    range as follows:
 378
 379    --- CODE RANGE of Emacs' internal format ---
 380    (character set)      (range)
 381    ASCII                0x00 .. 0x7F
 382    ELSE (1st byte)      0x80 .. 0x9F
 383         (rest bytes)    0xA0 .. 0xFF
 384    ---------------------------------------------
 385
 386   */
 387
 388 enum emacs_code_class_type emacs_code_class[256];
     /* One class per byte value.  detect_coding_emacs_mule dispatches on
        emacs_code_class[c] for each byte C it examines.  The table is
        not filled in within this part of the file.  */
 389
 390 /* Go to the next statement only if *SRC is accessible and the code is
 391    greater than 0xA0.  */
 392 #define CHECK_CODE_RANGE_A0_FF  \
 393   do {                          \
 394     if (src >= src_end)         \
 395       goto label_end_of_switch; \
 396     else if (*src++ < 0xA0)     \
 397       return 0;                 \
 398   } while (0)
 399
 400 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 401    Check if a text is encoded in Emacs' internal format.  If it is,
 402    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 403
 404 int
 405 detect_coding_emacs_mule (src, src_end)
 406      unsigned char *src, *src_end;
 407 {
 408   unsigned char c;
 409   int composing = 0;
 410
 411   while (src < src_end)
 412     {
 413       c = *src++;
 414
 415       if (composing)
 416         {
 417           if (c < 0xA0)
 418             composing = 0;
 419           else
 420             c -= 0x20;
 421         }
 422
 423       switch (emacs_code_class[c])
 424         {
 425         case EMACS_ascii_code:
 426         case EMACS_linefeed_code:
 427           break;
 428
 429         case EMACS_control_code:
 430           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 431             return 0;
 432           break;
 433
 434         case EMACS_invalid_code:
 435           return 0;
 436
 437         case EMACS_leading_code_composition: /* c == 0x80 */
 438           if (composing)
 439             CHECK_CODE_RANGE_A0_FF;
 440           else
 441             composing = 1;
 442           break;
 443
 444         case EMACS_leading_code_4:
 445           CHECK_CODE_RANGE_A0_FF;
 446           /* fall down to check it two more times ...  */
 447
 448         case EMACS_leading_code_3:
 449           CHECK_CODE_RANGE_A0_FF;
 450           /* fall down to check it one more time ...  */
 451
 452         case EMACS_leading_code_2:
 453           CHECK_CODE_RANGE_A0_FF;
 454           break;
 455
 456         default:
 457         label_end_of_switch:
 458           break;
 459         }
 460     }
 461   return CODING_CATEGORY_MASK_EMACS_MULE;
 462 }
 463
 464 \f
 465 /*** 3. ISO2022 handlers ***/
 466
 467 /* The following note describes the coding system ISO2022 briefly.
 468    Since the intention of this note is to help in understanding of
 469    the programs in this file, some parts are NOT ACCURATE or OVERLY
 470    SIMPLIFIED.  For the thorough understanding, please refer to the
 471    original document of ISO2022.
 472
 473    ISO2022 provides many mechanisms to encode several character sets
 474    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 475    all text is encoded by codes of less than 128.  This may make the
 476    encoded text a little bit longer, but the text gets more stability
 477    to pass through several gateways (some of them strip off the MSB).
 478
 479    There are two kinds of character set: control character set and
 480    graphic character set.  The former contains control characters such
 481    as `newline' and `escape' to provide control functions (control
 482    functions are provided also by escape sequences).  The latter
 483    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 484    two control character sets and many graphic character sets.
 485
 486    Graphic character sets are classified into one of the following
 487    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 488    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 489    bytes (DIMENSION) and the number of characters in one dimension
 490    (CHARS) of the set.  In addition, each character set is assigned an
 491    identification tag (called "final character" and denoted as <F>
 492    here after) which is unique in each class.  <F> of each character
 493    set is decided by ECMA(*) when it is registered in ISO.  Code range
 494    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 495
 496    Note (*): ECMA = European Computer Manufacturers Association
 497
 498    Here are examples of graphic character set [NAME(<F>)]:
 499         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 500         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 501         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 502         o DIMENSION2_CHARS96 -- none for the moment
 503
 504    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 505         C0 [0x00..0x1F] -- control character plane 0
 506         GL [0x20..0x7F] -- graphic character plane 0
 507         C1 [0x80..0x9F] -- control character plane 1
 508         GR [0xA0..0xFF] -- graphic character plane 1
 509
 510    A control character set is directly designated and invoked to C0 or
 511    C1 by an escape sequence.  The most common case is that ISO646's
 512    control character set is designated/invoked to C0 and ISO6429's
 513    control character set is designated/invoked to C1, and usually
 514    these designations/invocations are omitted in a coded text.  With
 515    7-bit environment, only C0 can be used, and a control character for
 516    C1 is encoded by an appropriate escape sequence to fit in the
 517    environment.  All control characters for C1 have
 518    corresponding escape sequences defined.
 519
 520    A graphic character set is at first designated to one of four
 521    graphic registers (G0 through G3), then these graphic registers are
 522    invoked to GL or GR.  These designations and invocations can be
 523    done independently.  The most common case is that G0 is invoked to
 524    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 525    these invocations and designations are omitted in a coded text.
 526    With 7-bit environment, only GL can be used.
 527
 528    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 529    and 0x7F of GL area work as control characters SPACE and DEL
 530    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 531
 532    There are two ways of invocation: locking-shift and single-shift.
 533    With locking-shift, the invocation lasts until the next different
 534    invocation, whereas with single-shift, the invocation works only
 535    for the following character and doesn't affect locking-shift.
 536    Invocations are done by the following control characters or escape
 537    sequences.
 538
 539    ----------------------------------------------------------------------
 540    function             control char    escape sequence description
 541    ----------------------------------------------------------------------
 542    SI  (shift-in)               0x0F    none            invoke G0 to GL
 543    SO  (shift-out)              0x0E    none            invoke G1 to GL
 544    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 545    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 546    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 547    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 548    ----------------------------------------------------------------------
 549    The first four are for locking-shift.  Control characters for these
 550    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 551
 552    Designations are done by the following escape sequences.
 553    ----------------------------------------------------------------------
 554    escape sequence      description
 555    ----------------------------------------------------------------------
 556    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 557    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 558    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 559    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 560    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 561    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 562    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 563    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 564    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 565    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 566    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 567    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 568    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 569    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 570    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 571    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 572    ----------------------------------------------------------------------
 573
 574    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 575    of dimension 1, chars 94, and final character <F>, and etc.
 576
 577    Note (*): Although these designations are not allowed in ISO2022,
 578    Emacs accepts them on decoding, and produces them on encoding
 579    CHARS96 character set in a coding system which is characterized as
 580    7-bit environment, non-locking-shift, and non-single-shift.
 581
 582    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 583    '(' can be omitted.  We call this as "short-form" here after.
 584
 585    Now you may notice that there are a lot of ways for encoding the
 586    same multilingual text in ISO2022.  Actually, there exists many
 587    coding systems such as Compound Text (used in X's inter client
 588    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 589    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 590    localized platforms), and all of these are variants of ISO2022.
 591
 592    In addition to the above, Emacs handles two more kinds of escape
 593    sequences: ISO6429's direction specification and Emacs' private
 594    sequence for specifying character composition.
 595
 596    ISO6429's direction specification takes the following format:
 597         o CSI ']'      -- end of the current direction
 598         o CSI '0' ']'  -- end of the current direction
 599         o CSI '1' ']'  -- start of left-to-right text
 600         o CSI '2' ']'  -- start of right-to-left text
 601    The control character CSI (0x9B: control sequence introducer) is
 602    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 603
 604    Character composition specification takes the following format:
 605         o ESC '0' -- start character composition
 606         o ESC '1' -- end character composition
 607    Since these are not standard escape sequences of any ISO, the use
 608    of them for these meaning is restricted to Emacs only.  */
 609
 610 enum iso_code_class_type iso_code_class[256];
     /* One class per byte value.  NOTE(review): presumably the ISO2022
        counterpart of emacs_code_class, filled in elsewhere in this
        file -- confirm.  */
 611
 612 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 613    Check if a text is encoded in ISO2022.  If it is, returns an
 614    integer in which appropriate flag bits any of:
 615         CODING_CATEGORY_MASK_ISO_7
 616         CODING_CATEGORY_MASK_ISO_8_1
 617         CODING_CATEGORY_MASK_ISO_8_2
 618         CODING_CATEGORY_MASK_ISO_7_ELSE
 619         CODING_CATEGORY_MASK_ISO_8_ELSE
 620    are set.  If a code which should never appear in ISO2022 is found,
 621    returns 0.  */
 622
 623 int
 624 detect_coding_iso2022 (src, src_end)
 625      unsigned char *src, *src_end;
 626 {
 627   int mask = (CODING_CATEGORY_MASK_ISO_7
 628               | CODING_CATEGORY_MASK_ISO_8_1
 629               | CODING_CATEGORY_MASK_ISO_8_2
 630               | CODING_CATEGORY_MASK_ISO_7_ELSE
 631               | CODING_CATEGORY_MASK_ISO_8_ELSE
 632               );
 633   int g1 = 0;                   /* 1 iff designating to G1.  */
 634   int c, i;
 635   struct coding_system coding_iso_8_1, coding_iso_8_2;
 636
 637   /* Coding systems of these categories may accept latin extra codes.  */
 638   setup_coding_system
 639     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
 640      &coding_iso_8_1);
 641   setup_coding_system
 642     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
 643      &coding_iso_8_2);
 644
 645   while (mask && src < src_end)
 646     {
 647       c = *src++;
 648       switch (c)
 649         {
 650         case ISO_CODE_ESC:
 651           if (src >= src_end)
 652             break;
 653           c = *src++;
 654           if ((c >= '(' && c <= '/'))
 655             {
 656               /* Designation sequence for a charset of dimension 1.  */
 657               if (src >= src_end)
 658                 break;
 659               c = *src++;
 660               if (c < ' ' || c >= 0x80)
 661                 /* Invalid designation sequence.  */
 662                 return 0;
 663             }
 664           else if (c == '$')
 665             {
 666               /* Designation sequence for a charset of dimension 2.  */
 667               if (src >= src_end)
 668                 break;
 669               c = *src++;
 670               if (c >= '@' && c <= 'B')
 671                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 672                 ;
 673               else if (c >= '(' && c <= '/')
 674                 {
 675                   if (src >= src_end)
 676                     break;
 677                   c = *src++;
 678                   if (c < ' ' || c >= 0x80)
 679                     /* Invalid designation sequence.  */
 680                     return 0;
 681                 }
 682               else
 683                 /* Invalid designation sequence.  */
 684                 return 0;
 685             }
 686           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 687             /* Locking shift.  */
 688             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 689                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 690           else if (c == '0' || c == '1' || c == '2')
 691             /* Start/end composition.  */
 692             ;
 693           else
 694             /* Invalid escape sequence.  */
 695             return 0;
 696           break;
 697
 698         case ISO_CODE_SO:
 699           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 700                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 701           break;
 702
 703         case ISO_CODE_CSI:
 704         case ISO_CODE_SS2:
 705         case ISO_CODE_SS3:
 706           {
 707             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 708
 709             if (c != ISO_CODE_CSI)
 710               {
 711                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_SINGLE_SHIFT)
 712                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 713                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_SINGLE_SHIFT)
 714                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 715               }
 716             if (VECTORP (Vlatin_extra_code_table)
 717                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 718               {
 719                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 720                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 721                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 722                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 723               }
 724             mask &= newmask;
 725           }
 726           break;
 727
 728         default:
 729           if (c < 0x80)
 730             break;
 731           else if (c < 0xA0)
 732             {
 733               if (VECTORP (Vlatin_extra_code_table)
 734                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 735                 {
 736                   int newmask = 0;
 737
 738                   if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 739                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 740                   if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 741                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 742                   mask &= newmask;
 743                 }
 744               else
 745                 return 0;
 746             }
 747           else
 748             {
 749               unsigned char *src_begin = src;
 750
 751               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 752                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 753               while (src < src_end && *src >= 0xA0)
 754                 src++;
 755               if ((src - src_begin - 1) & 1 && src < src_end)
 756                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 757             }
 758           break;
 759         }
 760     }
 761
 762   return mask;
 763 }
 764
 765 /* Decode a character of which charset is CHARSET and the 1st position
 766    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 767    fetched from SRC and set to C2.  If CHARSET is negative, it means
 768    that we are decoding ill formed text, and what we can do is just to
 769    read C1 as is.  This macro uses `coding', `src', `dst', `c2' and
 770    `unification_table' of the function in which it is expanded.  */
 771 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 772   do {                                                                  \
 773     int c_alt, charset_alt = (charset);                                 \
 774     if (COMPOSING_HEAD_P (coding->composing))                           \
 775       {                                                                 \
 776         *dst++ = LEADING_CODE_COMPOSITION;                              \
 777         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 778           /* To tell composition rules are embedded.  */                \
 779           *dst++ = 0xFF;                                                \
 780         coding->composing += 2;                                         \
 781       }                                                                 \
 782     if ((charset) >= 0)                                                 \
 783       {                                                                 \
 784         if (CHARSET_DIMENSION (charset) == 2)                           \
 785           {                                                             \
 786             ONE_MORE_BYTE (c2); /* Leaves the loop if SRC is exhausted.  */ \
 787             if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
 788                 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
 789               { /* C2 is in neither class: un-read it and use SPACE.  */ \
 790                 src--;                                                  \
 791                 c2 = ' ';                                               \
 792               }                                                         \
 793           }                                                             \
 794         if (!NILP (unification_table)                                   \
 795             && ((c_alt = unify_char (unification_table,                 \
 796                                      -1, (charset), c1, c2)) >= 0))     \
 797           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 798       }                                                                 \
 799     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 800       DECODE_CHARACTER_ASCII (c1);                                      \
 801     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 802       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 803     else                                                                \
 804       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 805     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 806       /* To tell a composition rule follows.  */                        \
 807       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 808   } while (0)
 809
 810 /* Set designation state into CODING: designate to graphic register REG the charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS and FINAL_CHAR.  */
 811 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 812   do {                                                                  \
 813     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 814                                      make_number (chars),               \
 815                                      make_number (final_char));         \
 816     if (charset >= 0)   /* Otherwise REG is left untouched.  */         \
 817       {   /* With direction 1, use the reverse charset if there is one.  */ \
 818         if (coding->direction == 1                                      \
 819             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 820           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 821         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 822       }                                                                 \
 823   } while (0)
 824
 825 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  Decode SRC_BYTES bytes at SOURCE
 826    into DESTINATION (DST_BYTES bytes long); set *CONSUMED to the bytes consumed and return the bytes produced.  */
 827 int
 828 decode_coding_iso2022 (coding, source, destination,
 829                        src_bytes, dst_bytes, consumed)
 830      struct coding_system *coding;
 831      unsigned char *source, *destination;
 832      int src_bytes, dst_bytes;
 833      int *consumed;
 834 {
 835   unsigned char *src = source;
 836   unsigned char *src_end = source + src_bytes;
 837   unsigned char *dst = destination;
 838   unsigned char *dst_end = destination + dst_bytes;
 839   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 840      from DST_END to assure that overflow checking is necessary only
 841      at the head of loop.  */
 842   unsigned char *adjusted_dst_end = dst_end - 6;
 843   int charset;
 844   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 845   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 846   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 847   Lisp_Object unification_table
 848       = coding->character_unification_table_for_decode;
 849
 850   if (!NILP (Venable_character_unification) && NILP (unification_table))
 851     unification_table = Vstandard_character_unification_table_for_decode;
 852
 853   while (src < src_end && dst < adjusted_dst_end)
 854     {
 855       /* SRC_BASE remembers the start position in source in each loop.
 856          The loop will be exited when there's not enough source text
 857          to analyze long escape sequence or 2-byte code (within macros
 858          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 859          to SRC_BASE before exiting.  */
 860       unsigned char *src_base = src;
 861       int c1 = *src++, c2;
 862
 863       switch (iso_code_class [c1])
 864         {
 865         case ISO_0x20_or_0x7F:
 866           if (!coding->composing
 867               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 868             {
 869               /* This is SPACE or DEL.  */
 870               *dst++ = c1;
 871               break;
 872             }
 873           /* This is a graphic character, we fall down ...  */
 874
 875         case ISO_graphic_plane_0:
 876           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 877             {
 878               /* This is a composition rule.  */
 879               *dst++ = c1 | 0x80;
 880               coding->composing = COMPOSING_WITH_RULE_TAIL;
 881             }
 882           else
 883             DECODE_ISO_CHARACTER (charset0, c1);
 884           break;
 885
 886         case ISO_0xA0_or_0xFF:
 887           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 888             {
 889               /* Invalid code.  */
 890               *dst++ = c1;
 891               break;
 892             }
 893           /* This is a graphic character, we fall down ... */
 894
 895         case ISO_graphic_plane_1:
 896           DECODE_ISO_CHARACTER (charset1, c1);
 897           break;
 898
 899         case ISO_control_code:
 900           /* All ISO2022 control characters in this class have the
 901              same representation in Emacs internal format.  */
 902           *dst++ = c1;
 903           break;
 904
 905         case ISO_carriage_return:
 906           if (coding->eol_type == CODING_EOL_CR)
 907             {
 908               *dst++ = '\n';
 909             }
 910           else if (coding->eol_type == CODING_EOL_CRLF)
 911             {
 912               ONE_MORE_BYTE (c1);
 913               if (c1 == ISO_CODE_LF)
 914                 *dst++ = '\n';
 915               else
 916                 {       /* Lone CR: C1 now holds the byte after it, so emit the CR itself and re-read that byte.  */
 917                   src--;
 918                   *dst++ = '\r';
 919                 }
 920             }
 921           else
 922             {
 923               *dst++ = c1;
 924             }
 925           break;
 926
 927         case ISO_shift_out:
 928           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 929             goto label_invalid_escape_sequence;
 930           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 931           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 932           break;
 933
 934         case ISO_shift_in:
 935           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 936           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 937           break;
 938
 939         case ISO_single_shift_2_7:
 940         case ISO_single_shift_2:
 941           /* SS2 is handled as an escape sequence of ESC 'N' */
 942           c1 = 'N';
 943           goto label_escape_sequence;
 944
 945         case ISO_single_shift_3:
 946           /* SS3 is handled as an escape sequence of ESC 'O' */
 947           c1 = 'O';
 948           goto label_escape_sequence;
 949
 950         case ISO_control_sequence_introducer:
 951           /* CSI is handled as an escape sequence of ESC '[' ...  */
 952           c1 = '[';
 953           goto label_escape_sequence;
 954
 955         case ISO_escape:
 956           ONE_MORE_BYTE (c1);
 957         label_escape_sequence:
 958           /* Escape sequences handled by Emacs are invocation,
 959              designation, direction specification, and character
 960              composition specification.  */
 961           switch (c1)
 962             {
 963             case '&':           /* revision of following character set */
 964               ONE_MORE_BYTE (c1);
 965               if (!(c1 >= '@' && c1 <= '~'))
 966                 goto label_invalid_escape_sequence;
 967               ONE_MORE_BYTE (c1);
 968               if (c1 != ISO_CODE_ESC)
 969                 goto label_invalid_escape_sequence;
 970               ONE_MORE_BYTE (c1);
 971               goto label_escape_sequence;
 972
 973             case '$':           /* designation of 2-byte character set */
 974               ONE_MORE_BYTE (c1);
 975               if (c1 >= '@' && c1 <= 'B')
 976                 {       /* designation of JISX0208.1978, GB2312.1980,
 977                                    or JISX0208.1980 */
 978                   DECODE_DESIGNATION (0, 2, 94, c1);
 979                 }
 980               else if (c1 >= 0x28 && c1 <= 0x2B)
 981                 {       /* designation of DIMENSION2_CHARS94 character set */
 982                   ONE_MORE_BYTE (c2);
 983                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 984                 }
 985               else if (c1 >= 0x2C && c1 <= 0x2F)
 986                 {       /* designation of DIMENSION2_CHARS96 character set */
 987                   ONE_MORE_BYTE (c2);
 988                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 989                 }
 990               else
 991                 goto label_invalid_escape_sequence;
 992               break;
 993
 994             case 'n':           /* invocation of locking-shift-2 */
 995               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 996                 goto label_invalid_escape_sequence;
 997               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 998               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 999               break;
1000
1001             case 'o':           /* invocation of locking-shift-3 */
1002               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1003                 goto label_invalid_escape_sequence;
1004               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1005               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1006               break;
1007
1008             case 'N':           /* invocation of single-shift-2 */
1009               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1010                 goto label_invalid_escape_sequence;
1011               ONE_MORE_BYTE (c1);
1012               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1013               DECODE_ISO_CHARACTER (charset, c1);
1014               break;
1015
1016             case 'O':           /* invocation of single-shift-3 */
1017               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1018                 goto label_invalid_escape_sequence;
1019               ONE_MORE_BYTE (c1);
1020               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1021               DECODE_ISO_CHARACTER (charset, c1);
1022               break;
1023
1024             case '0':           /* start composing without embedded rules */
1025               coding->composing = COMPOSING_NO_RULE_HEAD;
1026               break;
1027
1028             case '1':           /* end composing */
1029               coding->composing = COMPOSING_NO;
1030               break;
1031
1032             case '2':           /* start composing with embedded rules */
1033               coding->composing = COMPOSING_WITH_RULE_HEAD;
1034               break;
1035
1036             case '[':           /* specification of direction */
1037               /* For the moment, nested direction is not supported.
1038                  So, the value of `coding->direction' is 0 or 1: 0
1039                  means left-to-right, 1 means right-to-left.  */
1040               ONE_MORE_BYTE (c1);
1041               switch (c1)
1042                 {
1043                 case ']':       /* end of the current direction; the sequence is already complete */
1044                   coding->direction = 0;
1045                   break;
1046                 case '0':       /* end of the current direction */
1047                 case '1':       /* start of left-to-right direction */
1048                   ONE_MORE_BYTE (c1);
1049                   if (c1 == ']')
1050                     coding->direction = 0;
1051                   else
1052                     goto label_invalid_escape_sequence;
1053                   break;
1054
1055                 case '2':       /* start of right-to-left direction */
1056                   ONE_MORE_BYTE (c1);
1057                   if (c1 == ']')
1058                     coding->direction= 1;
1059                   else
1060                     goto label_invalid_escape_sequence;
1061                   break;
1062
1063                 default:
1064                   goto label_invalid_escape_sequence;
1065                 }
1066               break;
1067
1068             default:
1069               if (c1 >= 0x28 && c1 <= 0x2B)
1070                 {       /* designation of DIMENSION1_CHARS94 character set */
1071                   ONE_MORE_BYTE (c2);
1072                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1073                 }
1074               else if (c1 >= 0x2C && c1 <= 0x2F)
1075                 {       /* designation of DIMENSION1_CHARS96 character set */
1076                   ONE_MORE_BYTE (c2);
1077                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1078                 }
1079               else
1080                 {
1081                   goto label_invalid_escape_sequence;
1082                 }
1083             }
1084           /* We must update these variables now.  */
1085           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1086           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1087           break;
1088
1089         label_invalid_escape_sequence:
1090           {       /* Copy the bytes read in this iteration to DST as they are.  */
1091             int length = src - src_base;
1092
1093             bcopy (src_base, dst, length);
1094             dst += length;
1095           }
1096         }
1097       continue;
1098
1099     label_end_of_loop:
1100       coding->carryover_size = src - src_base;
1101       bcopy (src_base, coding->carryover, coding->carryover_size);
1102       src = src_base;
1103       break;
1104     }
1105
1106   /* If this is the last block of the text to be decoded, we had
1107      better just flush out all remaining codes in the text although
1108      they are not valid characters.  NOTE(review): this copy is not checked against DST_END -- confirm callers leave room.  */
1109   if (coding->last_block)
1110     {
1111       bcopy (src, dst, src_end - src);
1112       dst += (src_end - src);
1113       src = src_end;
1114     }
1115   *consumed = src - source;
1116   return dst - destination;
1117 }
1118
1119 /* ISO2022 encoding stuff.  */
1120
1121 /*
1122    It is not enough to say just "ISO2022" on encoding, we have to
1123    specify more details.  In Emacs, each coding-system of ISO2022
1124    variant has the following specifications:
1125         1. Initial designation to G0 thru G3.
1126         2. Allows short-form designation?
1127         3. ASCII should be designated to G0 before control characters?
1128         4. ASCII should be designated to G0 at end of line?
1129         5. 7-bit environment or 8-bit environment?
1130         6. Use locking-shift?
1131         7. Use Single-shift?
1132    And the following two are only for Japanese:
1133         8. Use ASCII in place of JIS0201-1976-Roman?
1134         9. Use JISX0208-1983 in place of JISX0208-1978?
1135    These specifications are encoded in `coding->flags' as flag bits
1136    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1137    details.
1138 */
1139
1140 /* Produce codes (escape sequence) for designating CHARSET to graphic
1141    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1142    the coding system CODING allows, produce designation sequence of
1143    short-form.  The codes are stored through `dst' of the expansion
1144    site, and the designation is recorded in CODING.  */
1145 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1146   do {                                                                  \
1147     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1148     char *intermediate_char_94 = "()*+";        /* indexed by REG */    \
1149     char *intermediate_char_96 = ",-./";        /* indexed by REG */    \
1150     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
1151     if (revision < 255)                                                 \
1152       {   /* Produce ESC '&' ('@' + revision) ahead of the designation.  */ \
1153         *dst++ = ISO_CODE_ESC;                                          \
1154         *dst++ = '&';                                                   \
1155         *dst++ = '@' + revision;                                        \
1156       }                                                                 \
1157     *dst++ = ISO_CODE_ESC;                                              \
1158     if (CHARSET_DIMENSION (charset) == 1)                               \
1159       {                                                                 \
1160         if (CHARSET_CHARS (charset) == 94)                              \
1161           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1162         else                                                            \
1163           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1164       }                                                                 \
1165     else                                                                \
1166       {                                                                 \
1167         *dst++ = '$';                                                   \
1168         if (CHARSET_CHARS (charset) == 94)                              \
1169           {   /* Short-form leaves out the intermediate character.  */  \
1170             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1171                 || reg != 0                                             \
1172                 || final_char < '@' || final_char > 'B')                \
1173               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1174           }                                                             \
1175         else                                                            \
1176           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1177       }                                                                 \
1178     *dst++ = final_char;                                                \
1179     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1180   } while (0)
1181
1182 /* The two macros below emit the code for an ISO2022 single-shift
1183    function (single-shift-2 and single-shift-3): an escape sequence
1184    if CODING_FLAG_ISO_SEVEN_BITS is set, a control character if not.  */
1185
1186 #define ENCODE_SINGLE_SHIFT_2                                   \
1187   do {                                                          \
1188     if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))          \
1189       *dst++ = ISO_CODE_SS2;                                    \
1190     else                                                        \
1191       { *dst++ = ISO_CODE_ESC; *dst++ = 'N'; }                  \
1192     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
1193   } while (0)
1194 /* Emit single-shift-3: ESC 'O' if CODING_FLAG_ISO_SEVEN_BITS is set, else the SS3 control character.  */
1195 #define ENCODE_SINGLE_SHIFT_3                                   \
1196   do {                                                          \
1197     if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))          \
1198       *dst++ = ISO_CODE_SS3;                                    \
1199     else                                                        \
1200       { *dst++ = ISO_CODE_ESC; *dst++ = 'O'; }                  \
1201     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
1202   } while (0)
1203
1204 /* The four macros below emit the code (a control character or an
1205    escape sequence) for an ISO2022 locking-shift function: shift-in,
1206    shift-out, locking-shift-2, and locking-shift-3.  */
1207
1208 #define ENCODE_SHIFT_IN                                 \
1209   do {                                                  \
1210     *dst = ISO_CODE_SI, ++dst;                          \
1211     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
1212   } while (0)
1213 /* Emit shift-out and record graphic register 1 as invoked to graphic plane 0.  */
1214 #define ENCODE_SHIFT_OUT                                \
1215   do {                                                  \
1216     *dst = ISO_CODE_SO, ++dst;                          \
1217     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
1218   } while (0)
1219 /* Emit locking-shift-2 (ESC 'n') and record graphic register 2 as invoked to graphic plane 0.  */
1220 #define ENCODE_LOCKING_SHIFT_2                          \
1221   do {                                                  \
1222     dst[0] = ISO_CODE_ESC, dst[1] = 'n', dst += 2;      \
1223     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
1224   } while (0)
1225 /* Emit locking-shift-3 (ESC 'o') and record graphic register 3 as invoked to graphic plane 0.  */
1226 #define ENCODE_LOCKING_SHIFT_3                          \
1227   do {                                                  \
1228     dst[0] = ISO_CODE_ESC, dst[1] = 'o', dst += 2;      \
1229     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
1230   } while (0)
1231
1232 /* Produce codes for a DIMENSION1 character whose character set is
1233    CHARSET and whose position-code is C1.  Designation and invocation
1234    sequences are also produced in advance if necessary.  The codes
1235    are stored through `dst' of the expansion site.  */
1236
1237 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1238   do {                                                                  \
1239     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1240       {   /* The single-shift state is cleared once this character is produced.  */ \
1241         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1242           *dst++ = c1 & 0x7F;                                           \
1243         else                                                            \
1244           *dst++ = c1 | 0x80;                                           \
1245         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1246         break;                                                          \
1247       }                                                                 \
1248     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1249       {   /* CHARSET is in graphic plane 0: clear the 8th bit.  */      \
1250         *dst++ = c1 & 0x7F;                                             \
1251         break;                                                          \
1252       }                                                                 \
1253     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1254       {   /* CHARSET is in graphic plane 1: set the 8th bit.  */        \
1255         *dst++ = c1 | 0x80;                                             \
1256         break;                                                          \
1257       }                                                                 \
1258     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1259              && !coding->safe_charsets[charset])                        \
1260       {                                                                 \
1261         /* We should not encode this character, instead produce one or  \
1262            two `?'s.  */                                                \
1263         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1264         if (CHARSET_WIDTH (charset) == 2)                               \
1265           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1266         break;                                                          \
1267       }                                                                 \
1268     else                                                                \
1269       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1270          must invoke it, or, at first, designate it to some graphic     \
1271          register.  Then repeat the loop to actually produce the        \
1272          character.  */                                                 \
1273       dst = encode_invocation_designation (charset, coding, dst);       \
1274   } while (1)
1275
1276 /* Produce codes for a DIMENSION2 character whose character set is
1277    CHARSET and whose position-codes are C1 and C2.  Designation and
1278    invocation codes are also produced in advance if necessary.  The
1279    codes are stored through `dst' of the expansion site.  */
1280 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1281   do {                                                                  \
1282     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1283       {   /* The single-shift state is cleared once this character is produced.  */ \
1284         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1285           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1286         else                                                            \
1287           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1288         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1289         break;                                                          \
1290       }                                                                 \
1291     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1292       {   /* CHARSET is in graphic plane 0: clear the 8th bits.  */     \
1293         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1294         break;                                                          \
1295       }                                                                 \
1296     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1297       {   /* CHARSET is in graphic plane 1: set the 8th bits.  */       \
1298         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1299         break;                                                          \
1300       }                                                                 \
1301     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1302              && !coding->safe_charsets[charset])                        \
1303       {                                                                 \
1304         /* We should not encode this character, instead produce one or  \
1305            two `?'s.  */                                                \
1306         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1307         if (CHARSET_WIDTH (charset) == 2)                               \
1308           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1309         break;                                                          \
1310       }                                                                 \
1311     else                                                                \
1312       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1313          must invoke it, or, at first, designate it to some graphic     \
1314          register.  Then repeat the loop to actually produce the        \
1315          character.  */                                                 \
1316       dst = encode_invocation_designation (charset, coding, dst);       \
1317   } while (1)
1318 /* Produce codes for the character given by CHARSET, C1 and C2, first mapping it through `unification_table' when that is non-nil.  C1 and C2 are handed to SPLIT_CHAR and may be changed.  */
1319 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1320   do {                                                                    \
1321     int c_alt, charset_alt;                                               \
1322     if (!NILP (unification_table)                                         \
1323         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1324             >= 0))                                                        \
1325       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1326     else                                                                  \
1327       charset_alt = charset;                                              \
1328     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1329       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1330     else                                                                  \
1331       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1332   } while (0)
1333
1334 /* Emit at DST whatever designation and invocation codes are needed
1335    before a character of CHARSET can be produced, recording the new
1336    state in the `spec.iso2022' element of *CODING.  The return value
1337    is DST advanced past the codes produced.  */
1338 unsigned char *
1339 encode_invocation_designation (charset, coding, dst)
1340      int charset;
1341      struct coding_system *coding;
1342      unsigned char *dst;
1343 {
1344   int g = 0;                    /* graphic register number */
1345
1346   /* Look for a graphic register CHARSET is already designated to.  */
1347   while (g < 4
1348          && charset != CODING_SPEC_ISO_DESIGNATION (coding, g))
1349     g++;
1350
1351   if (g == 4)
1352     {
1353       /* None was found, so CHARSET must be designated now.  Honor
1354          the designation it requests, if any; otherwise fall back
1355          on graphic register 0.  */
1356       g = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1357       if (g == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1358         g = 0;
1359
1360       ENCODE_DESIGNATION (charset, g, coding);
1361     }
1362
1363   /* Nothing more to do when the register is already invoked to
1364      graphic plane 0 or 1.  */
1365   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == g
1366       || CODING_SPEC_ISO_INVOCATION (coding, 1) == g)
1367     return dst;
1368
1369   /* Otherwise produce the code that invokes it.  */
1370   if (g == 0)
1371     ENCODE_SHIFT_IN;
1372   else if (g == 1)
1373     ENCODE_SHIFT_OUT;
1374   else if (g == 2)
1375     {
1376       /* Register 2: single-shift if CODING uses it, else
1377          locking-shift.  */
1378       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1379         ENCODE_SINGLE_SHIFT_2;
1380       else
1381         ENCODE_LOCKING_SHIFT_2;
1382     }
1383   else if (g == 3)
1384     {
1385       /* Register 3: the same choice.  */
1386       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1387         ENCODE_SINGLE_SHIFT_3;
1388       else
1389         ENCODE_LOCKING_SHIFT_3;
1390     }
1391
1392   /* No code is produced for any other register number.  */
1393
1394   return dst;
1395 }
1396
/* The following three macros produce codes for indicating composition:
   the start of a composition without rule, the start of a composition
   with rule, and the end of a composition.  Each appends two bytes at
   DST.  The expansions are parenthesized so that each macro behaves as
   a single expression wherever it is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END    (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1401
/* The following three macros produce codes for indicating direction
   of text.  They append at DST and read CODING at the place of use,
   and each must be used as a complete statement.

   The introducer is the two-byte form ESC '[' when the coding system
   has the CODING_FLAG_ISO_SEVEN_BITS bit set (whatever other flags are
   set), and the single byte ISO_CODE_CSI otherwise.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2', *dst++ = ']';                         \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0', *dst++ = ']';                         \
  } while (0)
1417
/* Bring the graphic planes and registers back to their initial state:
   shift in if graphic plane 0 does not currently invoke register 0,
   then re-designate every register whose initial designation is
   non-negative and differs from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0          \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)           \
                != CODING_SPEC_ISO_DESIGNATION (coding, reg)))              \
          {                                                                 \
            ENCODE_DESIGNATION                                              \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),           \
               reg, coding);                                                \
          }                                                                 \
        reg++;                                                              \
      }                                                                     \
  } while (0)
1432
1433 /* Produce designation sequences of charsets in the line started from
1434    *SRC to a place pointed by DSTP.
1435
1436    If the current block ends before any end-of-line, we may fail to
1437    find all the necessary *designations.  */
1438 encode_designation_at_bol (coding, table, src, src_end, dstp)
1439      struct coding_system *coding;
1440      Lisp_Object table;
1441      unsigned char *src, *src_end, **dstp;
1442 {
1443   int charset, c, found = 0, reg;
1444   /* Table of charsets to be designated to each graphic register.  */
1445   int r[4];
1446   unsigned char *dst = *dstp;
1447
1448   for (reg = 0; reg < 4; reg++)
1449     r[reg] = -1;
1450
1451   while (src < src_end && *src != '\n' && found < 4)
1452     {
1453       int bytes = BYTES_BY_CHAR_HEAD (*src);
1454
1455       if (NILP (table))
1456         charset = CHARSET_AT (src);
1457       else
1458         {
1459           int c_alt;
1460           unsigned char c1, c2;
1461
1462           SPLIT_STRING(src, bytes, charset, c1, c2);
1463           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1464             charset = CHAR_CHARSET (c_alt);
1465         }
1466
1467       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1468       if (r[reg] < 0)
1469         {
1470           found++;
1471           r[reg] = charset;
1472         }
1473
1474       src += bytes;
1475     }
1476
1477   if (found)
1478     {
1479       for (reg = 0; reg < 4; reg++)
1480         if (r[reg] >= 0
1481             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1482           ENCODE_DESIGNATION (r[reg], reg, coding);
1483       *dstp = dst;
1484     }
1485 }
1486
1487 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1488
1489 int
1490 encode_coding_iso2022 (coding, source, destination,
1491                        src_bytes, dst_bytes, consumed)
1492      struct coding_system *coding;
1493      unsigned char *source, *destination;
1494      int src_bytes, dst_bytes;
1495      int *consumed;
1496 {
1497   unsigned char *src = source;
1498   unsigned char *src_end = source + src_bytes;
1499   unsigned char *dst = destination;
1500   unsigned char *dst_end = destination + dst_bytes;
1501   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1502      from DST_END to assure overflow checking is necessary only at the
1503      head of loop.  */
1504   unsigned char *adjusted_dst_end = dst_end - 19;
1505   Lisp_Object unification_table
1506       = coding->character_unification_table_for_encode;
1507
1508   if (!NILP (Venable_character_unification) && NILP (unification_table))
1509     unification_table = Vstandard_character_unification_table_for_encode;
1510
1511   while (src < src_end && dst < adjusted_dst_end)
1512     {
1513       /* SRC_BASE remembers the start position in source in each loop.
1514          The loop will be exited when there's not enough source text
1515          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1516          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1517          reset to SRC_BASE before exiting.  */
1518       unsigned char *src_base = src;
1519       int charset, c1, c2, c3, c4;
1520
1521       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1522           && CODING_SPEC_ISO_BOL (coding))
1523         {
1524           /* We have to produce designation sequences if any now.  */
1525           encode_designation_at_bol (coding, unification_table,
1526                                      src, src_end, &dst);
1527           CODING_SPEC_ISO_BOL (coding) = 0;
1528         }
1529
1530       c1 = *src++;
1531       /* If we are seeing a component of a composite character, we are
1532          seeing a leading-code specially encoded for composition, or a
1533          composition rule if composing with rule.  We must set C1
1534          to a normal leading-code or an ASCII code.  If we are not at
1535          a composed character, we must reset the composition state.  */
1536       if (COMPOSING_P (coding->composing))
1537         {
1538           if (c1 < 0xA0)
1539             {
1540               /* We are not in a composite character any longer.  */
1541               coding->composing = COMPOSING_NO;
1542               ENCODE_COMPOSITION_END;
1543             }
1544           else
1545             {
1546               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1547                 {
1548                   *dst++ = c1 & 0x7F;
1549                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1550                   continue;
1551                 }
1552               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1553                 coding->composing = COMPOSING_WITH_RULE_RULE;
1554               if (c1 == 0xA0)
1555                 {
1556                   /* This is an ASCII component.  */
1557                   ONE_MORE_BYTE (c1);
1558                   c1 &= 0x7F;
1559                 }
1560               else
1561                 /* This is a leading-code of non ASCII component.  */
1562                 c1 -= 0x20;
1563             }
1564         }
1565
1566       /* Now encode one character.  C1 is a control character, an
1567          ASCII character, or a leading-code of multi-byte character.  */
1568       switch (emacs_code_class[c1])
1569         {
1570         case EMACS_ascii_code:
1571           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1572           break;
1573
1574         case EMACS_control_code:
1575           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1576             ENCODE_RESET_PLANE_AND_REGISTER;
1577           *dst++ = c1;
1578           break;
1579
1580         case EMACS_carriage_return_code:
1581           if (!coding->selective)
1582             {
1583               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1584                 ENCODE_RESET_PLANE_AND_REGISTER;
1585               *dst++ = c1;
1586               break;
1587             }
1588           /* fall down to treat '\r' as '\n' ...  */
1589
1590         case EMACS_linefeed_code:
1591           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1592             ENCODE_RESET_PLANE_AND_REGISTER;
1593           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1594             bcopy (coding->spec.iso2022.initial_designation,
1595                    coding->spec.iso2022.current_designation,
1596                    sizeof coding->spec.iso2022.initial_designation);
1597           if (coding->eol_type == CODING_EOL_LF
1598               || coding->eol_type == CODING_EOL_UNDECIDED)
1599             *dst++ = ISO_CODE_LF;
1600           else if (coding->eol_type == CODING_EOL_CRLF)
1601             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1602           else
1603             *dst++ = ISO_CODE_CR;
1604           CODING_SPEC_ISO_BOL (coding) = 1;
1605           break;
1606
1607         case EMACS_leading_code_2:
1608           ONE_MORE_BYTE (c2);
1609           if (c2 < 0xA0)
1610             {
1611               /* invalid sequence */
1612               *dst++ = c1;
1613               *dst++ = c2;
1614             }
1615           else
1616             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1617           break;
1618
1619         case EMACS_leading_code_3:
1620           TWO_MORE_BYTES (c2, c3);
1621           if (c2 < 0xA0 || c3 < 0xA0)
1622             {
1623               /* invalid sequence */
1624               *dst++ = c1;
1625               *dst++ = c2;
1626               *dst++ = c3;
1627             }
1628           else if (c1 < LEADING_CODE_PRIVATE_11)
1629             ENCODE_ISO_CHARACTER (c1, c2, c3);
1630           else
1631             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1632           break;
1633
1634         case EMACS_leading_code_4:
1635           THREE_MORE_BYTES (c2, c3, c4);
1636           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1637             {
1638               /* invalid sequence */
1639               *dst++ = c1;
1640               *dst++ = c2;
1641               *dst++ = c3;
1642               *dst++ = c4;
1643             }
1644           else
1645             ENCODE_ISO_CHARACTER (c2, c3, c4);
1646           break;
1647
1648         case EMACS_leading_code_composition:
1649           ONE_MORE_BYTE (c2);
1650           if (c2 < 0xA0)
1651             {
1652               /* invalid sequence */
1653               *dst++ = c1;
1654               *dst++ = c2;
1655             }
1656           else if (c2 == 0xFF)
1657             {
1658               coding->composing = COMPOSING_WITH_RULE_HEAD;
1659               ENCODE_COMPOSITION_WITH_RULE_START;
1660             }
1661           else
1662             {
1663               /* Rewind one byte because it is a character code of
1664                  composition elements.  */
1665               src--;
1666               coding->composing = COMPOSING_NO_RULE_HEAD;
1667               ENCODE_COMPOSITION_NO_RULE_START;
1668             }
1669           break;
1670
1671         case EMACS_invalid_code:
1672           *dst++ = c1;
1673           break;
1674         }
1675       continue;
1676     label_end_of_loop:
1677       /* We reach here because the source date ends not at character
1678          boundary.  */
1679       coding->carryover_size = src_end - src_base;
1680       bcopy (src_base, coding->carryover, coding->carryover_size);
1681       src = src_end;
1682       break;
1683     }
1684
1685   /* If this is the last block of the text to be encoded, we must
1686      reset graphic planes and registers to the initial state.  */
1687   if (src >= src_end && coding->last_block)
1688     {
1689       ENCODE_RESET_PLANE_AND_REGISTER;
1690       if (coding->carryover_size > 0
1691           && coding->carryover_size < (dst_end - dst))
1692         {
1693           bcopy (coding->carryover, dst, coding->carryover_size);
1694           dst += coding->carryover_size;
1695           coding->carryover_size = 0;
1696         }
1697     }
1698   *consumed = src - source;
1699   return dst - destination;
1700 }
1701
1702 \f
1703 /*** 4. SJIS and BIG5 handlers ***/
1704
1705 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1706    quite widely.  So, for the moment, Emacs supports them in the bare
1707    C code.  But, in the future, they may be supported only by CCL.  */
1708
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
1715
1716    --- CODE RANGE of SJIS ---
1717    (character set)      (range)
1718    ASCII                0x00 .. 0x7F
1719    KATAKANA-JISX0201    0xA0 .. 0xDF
1720    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1721             (2nd byte)  0x40 .. 0xFF
1722    -------------------------------
1723
1724 */
1725
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
1729
1730    --- CODE RANGE of BIG5 ---
1731    (character set)      (range)
1732    ASCII                0x00 .. 0x7F
1733    Big5 (1st byte)      0xA1 .. 0xFE
1734         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1735    --------------------------
1736
1737    Since the number of characters in Big5 is larger than maximum
1738    characters in Emacs' charset (96x96), it can't be handled as one
1739    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1740    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1741    contains frequently used characters and the latter contains less
1742    frequently used characters.  */
1743
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1748
/* Number of Big5 characters sharing one 1st byte: the 2nd byte has
   0x7F - 0x40 codes in its low range plus 0xFF - 0xA1 codes in its
   high range.  */
#define BIG5_SAME_ROW ((0x7F - 0x40) + (0xFF - 0xA1))
1751
/* Convert the BIG5 byte pair B1, B2 to Emacs' internal form.  CHARSET
   is set to `charset_big5_1' when B1 is below 0xC9 and to
   `charset_big5_2' otherwise; C1 and C2 receive the position-codes
   (each 0x21 plus a value below 0xFF - 0xA1).  CHARSET, C1 and C2
   must be lvalues.  B1 and B2 are evaluated more than once, so they
   must not have side effects; they are parenthesized so that compound
   expressions are accepted.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)
1766
/* Convert the character of CHARSET (`charset_big5_1' or
   `charset_big5_2') with position-codes C1, C2 (high bit already
   cleared) to the BIG5 byte pair B1, B2.  B1 and B2 must be lvalues.
   The arguments are parenthesized so that compound expressions are
   accepted; they must not have side effects.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* The first 0x3F codes of a row go to 0x40..0x7E, the rest to     \
       0xA1..0xFE.  */                                                  \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1776
/* Produce at DST the character of CHARSET with position-codes C1 and
   C2.  When UNIFICATION_TABLE is non-nil and unify_char returns a
   non-negative character, that character is split by SPLIT_CHAR and
   produced instead (C1 and C2 are overwritten).  A resulting charset
   that is negative or CHARSET_ASCII is produced as ASCII; otherwise
   the charset's dimension selects the one- or two-dimension form.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2);  \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
  } while (0)
1791
/* Encode at DST the character of CHARSET with position-codes C1 and
   C2, in SJIS when SJIS_P is nonzero and in BIG5 otherwise.  When
   UNIFICATION_TABLE is non-nil and unify_char returns a non-negative
   character, that character is split by SPLIT_CHAR and encoded
   instead.  C1 and C2 must be lvalues; they may be modified.

   Output is one byte for ASCII and for katakana-jisx0201 in SJIS, two
   bytes for jisx0208 in SJIS and for big5-1/big5-2 in BIG5.  Any other
   charset is written unencoded as the charset code followed by its
   position-codes, which takes up to three bytes.

   The expansion ends without a semicolon so that the macro can be
   used as a single statement, including between `if' and `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1833
1834 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1835    Check if a text is encoded in SJIS.  If it is, return
1836    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1837
1838 int
1839 detect_coding_sjis (src, src_end)
1840      unsigned char *src, *src_end;
1841 {
1842   unsigned char c;
1843
1844   while (src < src_end)
1845     {
1846       c = *src++;
1847       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1848         return 0;
1849       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1850         {
1851           if (src < src_end && *src++ < 0x40)
1852             return 0;
1853         }
1854     }
1855   return CODING_CATEGORY_MASK_SJIS;
1856 }
1857
1858 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1859    Check if a text is encoded in BIG5.  If it is, return
1860    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1861
1862 int
1863 detect_coding_big5 (src, src_end)
1864      unsigned char *src, *src_end;
1865 {
1866   unsigned char c;
1867
1868   while (src < src_end)
1869     {
1870       c = *src++;
1871       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1872         return 0;
1873       if (c >= 0xA1)
1874         {
1875           if (src >= src_end)
1876             break;
1877           c = *src++;
1878           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1879             return 0;
1880         }
1881     }
1882   return CODING_CATEGORY_MASK_BIG5;
1883 }
1884
1885 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1886    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1887
1888 int
1889 decode_coding_sjis_big5 (coding, source, destination,
1890                          src_bytes, dst_bytes, consumed, sjis_p)
1891      struct coding_system *coding;
1892      unsigned char *source, *destination;
1893      int src_bytes, dst_bytes;
1894      int *consumed;
1895      int sjis_p;
1896 {
1897   unsigned char *src = source;
1898   unsigned char *src_end = source + src_bytes;
1899   unsigned char *dst = destination;
1900   unsigned char *dst_end = destination + dst_bytes;
1901   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1902      from DST_END to assure overflow checking is necessary only at the
1903      head of loop.  */
1904   unsigned char *adjusted_dst_end = dst_end - 3;
1905   Lisp_Object unification_table
1906       = coding->character_unification_table_for_decode;
1907
1908   if (!NILP (Venable_character_unification) && NILP (unification_table))
1909     unification_table = Vstandard_character_unification_table_for_decode;
1910
1911   while (src < src_end && dst < adjusted_dst_end)
1912     {
1913       /* SRC_BASE remembers the start position in source in each loop.
1914          The loop will be exited when there's not enough source text
1915          to analyze two-byte character (within macro ONE_MORE_BYTE).
1916          In that case, SRC is reset to SRC_BASE before exiting.  */
1917       unsigned char *src_base = src;
1918       unsigned char c1 = *src++, c2, c3, c4;
1919
1920       if (c1 == '\r')
1921         {
1922           if (coding->eol_type == CODING_EOL_CRLF)
1923             {
1924               ONE_MORE_BYTE (c2);
1925               if (c2 == '\n')
1926                 *dst++ = c2;
1927               else
1928                 /* To process C2 again, SRC is subtracted by 1.  */
1929                 *dst++ = c1, src--;
1930             }
1931           else if (coding->eol_type == CODING_EOL_CR)
1932             *dst++ = '\n';
1933           else
1934             *dst++ = c1;
1935         }
1936       else if (c1 < 0x20)
1937         *dst++ = c1;
1938       else if (c1 < 0x80)
1939         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1940       else if (c1 < 0xA0 || c1 >= 0xE0)
1941         {
1942           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1943           if (sjis_p)
1944             {
1945               ONE_MORE_BYTE (c2);
1946               DECODE_SJIS (c1, c2, c3, c4);
1947               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1948             }
1949           else if (c1 >= 0xE0 && c1 < 0xFF)
1950             {
1951               int charset;
1952
1953               ONE_MORE_BYTE (c2);
1954               DECODE_BIG5 (c1, c2, charset, c3, c4);
1955               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1956             }
1957           else                  /* Invalid code */
1958             *dst++ = c1;
1959         }
1960       else
1961         {
1962           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1963           if (sjis_p)
1964             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1965           else
1966             {
1967               int charset;
1968
1969               ONE_MORE_BYTE (c2);
1970               DECODE_BIG5 (c1, c2, charset, c3, c4);
1971               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1972             }
1973         }
1974       continue;
1975
1976     label_end_of_loop:
1977       coding->carryover_size = src - src_base;
1978       bcopy (src_base, coding->carryover, coding->carryover_size);
1979       src = src_base;
1980       break;
1981     }
1982
1983   *consumed = src - source;
1984   return dst - destination;
1985 }
1986
1987 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1988    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1989    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1990    sure that all these charsets are registered as official charset
1991    (i.e. do not have extended leading-codes).  Characters of other
1992    charsets are produced without any encoding.  If SJIS_P is 1, encode
1993    SJIS text, else encode BIG5 text.  */
1994
1995 int
1996 encode_coding_sjis_big5 (coding, source, destination,
1997                          src_bytes, dst_bytes, consumed, sjis_p)
1998      struct coding_system *coding;
1999      unsigned char *source, *destination;
2000      int src_bytes, dst_bytes;
2001      int *consumed;
2002      int sjis_p;
2003 {
2004   unsigned char *src = source;
2005   unsigned char *src_end = source + src_bytes;
2006   unsigned char *dst = destination;
2007   unsigned char *dst_end = destination + dst_bytes;
2008   /* Since the maximum bytes produced by each loop is 2, we subtract 1
2009      from DST_END to assure overflow checking is necessary only at the
2010      head of loop.  */
2011   unsigned char *adjusted_dst_end = dst_end - 1;
2012   Lisp_Object unification_table
2013       = coding->character_unification_table_for_encode;
2014
2015   if (!NILP (Venable_character_unification) && NILP (unification_table))
2016     unification_table = Vstandard_character_unification_table_for_encode;
2017
2018   while (src < src_end && dst < adjusted_dst_end)
2019     {
2020       /* SRC_BASE remembers the start position in source in each loop.
2021          The loop will be exited when there's not enough source text
2022          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2023          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2024          before exiting.  */
2025       unsigned char *src_base = src;
2026       unsigned char c1 = *src++, c2, c3, c4;
2027
2028       if (coding->composing)
2029         {
2030           if (c1 == 0xA0)
2031             {
2032               ONE_MORE_BYTE (c1);
2033               c1 &= 0x7F;
2034             }
2035           else if (c1 >= 0xA0)
2036             c1 -= 0x20;
2037           else
2038             coding->composing = 0;
2039         }
2040
2041       switch (emacs_code_class[c1])
2042         {
2043         case EMACS_ascii_code:
2044           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2045           break;
2046
2047         case EMACS_control_code:
2048           *dst++ = c1;
2049           break;
2050
2051         case EMACS_carriage_return_code:
2052           if (!coding->selective)
2053             {
2054               *dst++ = c1;
2055               break;
2056             }
2057           /* fall down to treat '\r' as '\n' ...  */
2058
2059         case EMACS_linefeed_code:
2060           if (coding->eol_type == CODING_EOL_LF
2061               || coding->eol_type == CODING_EOL_UNDECIDED)
2062             *dst++ = '\n';
2063           else if (coding->eol_type == CODING_EOL_CRLF)
2064             *dst++ = '\r', *dst++ = '\n';
2065           else
2066             *dst++ = '\r';
2067           break;
2068
2069         case EMACS_leading_code_2:
2070           ONE_MORE_BYTE (c2);
2071           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2072           break;
2073
2074         case EMACS_leading_code_3:
2075           TWO_MORE_BYTES (c2, c3);
2076           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2077           break;
2078
2079         case EMACS_leading_code_4:
2080           THREE_MORE_BYTES (c2, c3, c4);
2081           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2082           break;
2083
2084         case EMACS_leading_code_composition:
2085           coding->composing = 1;
2086           break;
2087
2088         default:                /* i.e. case EMACS_invalid_code: */
2089           *dst++ = c1;
2090         }
2091       continue;
2092
2093     label_end_of_loop:
2094       coding->carryover_size = src_end - src_base;
2095       bcopy (src_base, coding->carryover, coding->carryover_size);
2096       src = src_end;
2097       break;
2098     }
2099
2100   *consumed = src - source;
2101   return dst - destination;
2102 }
2103
2104 \f
2105 /*** 5. End-of-line handlers ***/
2106
2107 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2108    This function is called only when `coding->eol_type' is
2109    CODING_EOL_CRLF or CODING_EOL_CR.  */
2110
2111 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2112      struct coding_system *coding;
2113      unsigned char *source, *destination;
2114      int src_bytes, dst_bytes;
2115      int *consumed;
2116 {
2117   unsigned char *src = source;
2118   unsigned char *src_end = source + src_bytes;
2119   unsigned char *dst = destination;
2120   unsigned char *dst_end = destination + dst_bytes;
2121   int produced;
2122
2123   switch (coding->eol_type)
2124     {
2125     case CODING_EOL_CRLF:
2126       {
2127         /* Since the maximum bytes produced by each loop is 2, we
2128            subtract 1 from DST_END to assure overflow checking is
2129            necessary only at the head of loop.  */
2130         unsigned char *adjusted_dst_end = dst_end - 1;
2131
2132         while (src < src_end && dst < adjusted_dst_end)
2133           {
2134             unsigned char *src_base = src;
2135             unsigned char c = *src++;
2136             if (c == '\r')
2137               {
2138                 ONE_MORE_BYTE (c);
2139                 if (c != '\n')
2140                   *dst++ = '\r';
2141                 *dst++ = c;
2142               }
2143             else
2144               *dst++ = c;
2145             continue;
2146
2147           label_end_of_loop:
2148             coding->carryover_size = src - src_base;
2149             bcopy (src_base, coding->carryover, coding->carryover_size);
2150             src = src_base;
2151             break;
2152           }
2153         *consumed = src - source;
2154         produced = dst - destination;
2155         break;
2156       }
2157
2158     case CODING_EOL_CR:
2159       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2160       bcopy (source, destination, produced);
2161       dst_end = destination + produced;
2162       while (dst < dst_end)
2163         if (*dst++ == '\r') dst[-1] = '\n';
2164       *consumed = produced;
2165       break;
2166
2167     default:                    /* i.e. case: CODING_EOL_LF */
2168       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2169       bcopy (source, destination, produced);
2170       *consumed = produced;
2171       break;
2172     }
2173
2174   return produced;
2175 }
2176
2177 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2178    format of end-of-line according to `coding->eol_type'.  If
2179    `coding->selective' is 1, code '\r' in source text also means
2180    end-of-line.  */
2181
2182 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2183      struct coding_system *coding;
2184      unsigned char *source, *destination;
2185      int src_bytes, dst_bytes;
2186      int *consumed;
2187 {
2188   unsigned char *src = source;
2189   unsigned char *dst = destination;
2190   int produced;
2191
2192   if (src_bytes <= 0)
2193     return 0;
2194
2195   switch (coding->eol_type)
2196     {
2197     case CODING_EOL_LF:
2198     case CODING_EOL_UNDECIDED:
2199       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2200       bcopy (source, destination, produced);
2201       if (coding->selective)
2202         {
2203           int i = produced;
2204           while (i--)
2205             if (*dst++ == '\r') dst[-1] = '\n';
2206         }
2207       *consumed = produced;
2208
2209     case CODING_EOL_CRLF:
2210       {
2211         unsigned char c;
2212         unsigned char *src_end = source + src_bytes;
2213         unsigned char *dst_end = destination + dst_bytes;
2214         /* Since the maximum bytes produced by each loop is 2, we
2215            subtract 1 from DST_END to assure overflow checking is
2216            necessary only at the head of loop.  */
2217         unsigned char *adjusted_dst_end = dst_end - 1;
2218
2219         while (src < src_end && dst < adjusted_dst_end)
2220           {
2221             c = *src++;
2222             if (c == '\n' || (c == '\r' && coding->selective))
2223               *dst++ = '\r', *dst++ = '\n';
2224             else
2225               *dst++ = c;
2226           }
2227         produced = dst - destination;
2228         *consumed = src - source;
2229         break;
2230       }
2231
2232     default:                    /* i.e. case CODING_EOL_CR: */
2233       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2234       bcopy (source, destination, produced);
2235       {
2236         int i = produced;
2237         while (i--)
2238           if (*dst++ == '\n') dst[-1] = '\r';
2239       }
2240       *consumed = produced;
2241     }
2242
2243   return produced;
2244 }
2245
2246 \f
2247 /*** 6. C library functions ***/
2248
2249 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2250    has a property `coding-system'.  The value of this property is a
2251    vector of length 5 (called a coding-vector).  Among the elements of
2252    this vector, the first (element[0]) and the fifth (element[4])
2253    carry important information for decoding/encoding.  Before
2254    decoding/encoding, this information should be set in fields of a
2255    structure of type `coding_system'.
2256
2257    A value of property `coding-system' can be a symbol of another
2258    subsidiary coding-system.  In that case, Emacs gets coding-vector
2259    from that symbol.
2260
2261    `element[0]' contains information to be set in `coding->type'.  The
2262    value and its meaning is as follows:
2263
2264    0 -- coding_type_emacs_mule
2265    1 -- coding_type_sjis
2266    2 -- coding_type_iso2022
2267    3 -- coding_type_big5
2268    4 -- coding_type_ccl encoder/decoder written in CCL
2269    nil -- coding_type_no_conversion
2270    t -- coding_type_undecided (automatic conversion on decoding,
2271                                no-conversion on encoding)
2272
2273    `element[4]' contains information to be set in `coding->flags' and
2274    `coding->spec'.  The meaning varies by `coding->type'.
2275
2276    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2277    of length 32 (of which the first 17 sub-elements are used now).
2278    Meanings of these sub-elements are:
2279
2280    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2281         If the value is an integer of valid charset, the charset is
2282         assumed to be designated to graphic register N initially.
2283
2284         If the value is minus, it is a minus value of charset which
2285         reserves graphic register N, which means that the charset is
2286         not designated initially but should be designated to graphic
2287         register N just before encoding a character in that charset.
2288
2289         If the value is nil, graphic register N is never used on
2290         encoding.
2291
2292    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2293         Each value takes t or nil.  See the section ISO2022 of
2294         `coding.h' for more information.
2295
2296    If `coding->type' is `coding_type_big5', element[4] is t to denote
2297    BIG5-ETen or nil to denote BIG5-HKU.
2298
2299    If `coding->type' takes the other value, element[4] is ignored.
2300
2301    Emacs Lisp's coding system also carries information about format of
2302    end-of-line in a value of property `eol-type'.  If the value is
2303    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2304    means CODING_EOL_CR.  If it is not integer, it should be a vector
2305    of subsidiary coding systems of which property `eol-type' has one
2306    of above values.
2307
2308 */
2309
2310 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2311    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2312    is setup so that no conversion is necessary and return -1, else
2313    return 0.  */
2314
2315 int
2316 setup_coding_system (coding_system, coding)
2317      Lisp_Object coding_system;
2318      struct coding_system *coding;
2319 {
2320   Lisp_Object coding_spec, plist, type, eol_type;
2321   Lisp_Object val;
2322   int i;
2323
2324   /* At first, set several fields to default values.  */
2325   coding->last_block = 0;
2326   coding->selective = 0;
2327   coding->composing = 0;
2328   coding->direction = 0;
2329   coding->carryover_size = 0;
2330   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2331   coding->character_unification_table_for_decode = Qnil;
2332   coding->character_unification_table_for_encode = Qnil;
2333
2334   coding->symbol = coding_system;
2335   eol_type = Qnil;
2336
2337   /* Get values of property `coding-system' and `eol-type'.
2338      Also get values of coding system properties:
2339      `post-read-conversion', `pre-write-conversion',
2340      `character-unification-table-for-decode',
2341      `character-unification-table-for-encode'.  */
2342   coding_spec = Fget (coding_system, Qcoding_system);
2343   if (!VECTORP (coding_spec)
2344       || XVECTOR (coding_spec)->size != 5
2345       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2346     goto label_invalid_coding_system;
2347   if (!inhibit_eol_conversion)
2348     eol_type = Fget (coding_system, Qeol_type);
2349
2350   plist = XVECTOR (coding_spec)->contents[3];
2351   coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2352   coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2353   val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2354   if (SYMBOLP (val))
2355     val = Fget (val, Qcharacter_unification_table_for_decode);
2356   coding->character_unification_table_for_decode
2357     = CHAR_TABLE_P (val) ? val : Qnil;
2358   val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2359   if (SYMBOLP (val))
2360     val = Fget (val, Qcharacter_unification_table_for_encode);
2361   coding->character_unification_table_for_encode
2362     = CHAR_TABLE_P (val) ? val : Qnil;
2363
2364   val = Fplist_get (plist, Qsafe_charsets);
2365   if (EQ (val, Qt))
2366     {
2367       for (i = 0; i <= MAX_CHARSET; i++)
2368         coding->safe_charsets[i] = 1;
2369     }
2370   else
2371     {
2372       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2373       while (CONSP (val))
2374         {
2375           if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2376             coding->safe_charsets[i] = 1;
2377           val = XCONS (val)->cdr;
2378         }
2379     }
2380
2381   if (VECTORP (eol_type))
2382     {
2383       coding->eol_type = CODING_EOL_UNDECIDED;
2384       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2385     }
2386   else if (XFASTINT (eol_type) == 1)
2387     {
2388       coding->eol_type = CODING_EOL_CRLF;
2389       coding->common_flags
2390         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2391     }
2392   else if (XFASTINT (eol_type) == 2)
2393     {
2394       coding->eol_type = CODING_EOL_CR;
2395       coding->common_flags
2396         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2397     }
2398   else
2399     {
2400       coding->eol_type = CODING_EOL_LF;
2401       coding->common_flags = 0;
2402     }
2403
2404   type = XVECTOR (coding_spec)->contents[0];
2405   switch (XFASTINT (type))
2406     {
2407     case 0:
2408       coding->type = coding_type_emacs_mule;
2409       if (!NILP (coding->post_read_conversion))
2410         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2411       if (!NILP (coding->pre_write_conversion))
2412         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2413       break;
2414
2415     case 1:
2416       coding->type = coding_type_sjis;
2417       coding->common_flags
2418         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2419       break;
2420
2421     case 2:
2422       coding->type = coding_type_iso2022;
2423       coding->common_flags
2424         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2425       {
2426         Lisp_Object val, temp;
2427         Lisp_Object *flags;
2428         int i, charset, default_reg_bits = 0;
2429
2430         val = XVECTOR (coding_spec)->contents[4];
2431
2432         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2433           goto label_invalid_coding_system;
2434
2435         flags = XVECTOR (val)->contents;
2436         coding->flags
2437           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2438              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2439              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2440              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2441              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2442              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2443              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2444              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2445              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2446              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2447              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2448              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2449              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2450              );
2451
2452         /* Invoke graphic register 0 to plane 0.  */
2453         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2454         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2455         CODING_SPEC_ISO_INVOCATION (coding, 1)
2456           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2457         /* Not single shifting at first.  */
2458         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2459         /* Beginning of buffer should also be regarded as bol. */
2460         CODING_SPEC_ISO_BOL (coding) = 1;
2461
2462         for (charset = 0; charset <= MAX_CHARSET; charset++)
2463           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2464         val = Vcharset_revision_alist;
2465         while (CONSP (val))
2466           {
2467             charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2468             if (charset >= 0
2469                 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2470                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2471               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2472             val = XCONS (val)->cdr;
2473           }
2474
2475         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2476            FLAGS[REG] can be one of below:
2477                 integer CHARSET: CHARSET occupies register I,
2478                 t: designate nothing to REG initially, but can be used
2479                   by any charsets,
2480                 list of integer, nil, or t: designate the first
2481                   element (if integer) to REG initially, the remaining
2482                   elements (if integer) is designated to REG on request,
2483                   if an element is t, REG can be used by any charset,
2484                 nil: REG is never used.  */
2485         for (charset = 0; charset <= MAX_CHARSET; charset++)
2486           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2487             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2488         for (i = 0; i < 4; i++)
2489           {
2490             if (INTEGERP (flags[i])
2491                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2492                 || (charset = get_charset_id (flags[i])) >= 0)
2493               {
2494                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2495                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2496               }
2497             else if (EQ (flags[i], Qt))
2498               {
2499                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2500                 default_reg_bits |= 1 << i;
2501               }
2502             else if (CONSP (flags[i]))
2503               {
2504                 Lisp_Object tail = flags[i];
2505
2506                 if (INTEGERP (XCONS (tail)->car)
2507                     && (charset = XINT (XCONS (tail)->car),
2508                         CHARSET_VALID_P (charset))
2509                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2510                   {
2511                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2512                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2513                   }
2514                 else
2515                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2516                 tail = XCONS (tail)->cdr;
2517                 while (CONSP (tail))
2518                   {
2519                     if (INTEGERP (XCONS (tail)->car)
2520                         && (charset = XINT (XCONS (tail)->car),
2521                             CHARSET_VALID_P (charset))
2522                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2523                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2524                         = i;
2525                     else if (EQ (XCONS (tail)->car, Qt))
2526                       default_reg_bits |= 1 << i;
2527                     tail = XCONS (tail)->cdr;
2528                   }
2529               }
2530             else
2531               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2532
2533             CODING_SPEC_ISO_DESIGNATION (coding, i)
2534               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2535           }
2536
2537         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2538           {
2539             /* REG 1 can be used only by locking shift in 7-bit env.  */
2540             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2541               default_reg_bits &= ~2;
2542             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2543               /* Without any shifting, only REG 0 and 1 can be used.  */
2544               default_reg_bits &= 3;
2545           }
2546
2547         for (charset = 0; charset <= MAX_CHARSET; charset++)
2548           if (CHARSET_VALID_P (charset)
2549               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2550                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2551             {
2552               /* We have not yet decided where to designate CHARSET.  */
2553               int reg_bits = default_reg_bits;
2554
2555               if (CHARSET_CHARS (charset) == 96)
2556                 /* A charset of CHARS96 can't be designated to REG 0.  */
2557                 reg_bits &= ~1;
2558
2559               if (reg_bits)
2560                 /* There exist some default graphic register.  */
2561                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2562                   = (reg_bits & 1
2563                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2564               else
2565                 /* We anyway have to designate CHARSET to somewhere.  */
2566                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2567                   = (CHARSET_CHARS (charset) == 94
2568                      ? 0
2569                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2570                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2571                         ? 1
2572                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2573                            ? 2 : 0)));
2574             }
2575       }
2576       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
2577       break;
2578
2579     case 3:
2580       coding->type = coding_type_big5;
2581       coding->common_flags
2582         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2583       coding->flags
2584         = (NILP (XVECTOR (coding_spec)->contents[4])
2585            ? CODING_FLAG_BIG5_HKU
2586            : CODING_FLAG_BIG5_ETEN);
2587       break;
2588
2589     case 4:
2590       coding->type = coding_type_ccl;
2591       coding->common_flags
2592         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2593       {
2594         Lisp_Object val = XVECTOR (coding_spec)->contents[4];
2595         if (CONSP  (val)
2596             && VECTORP (XCONS (val)->car)
2597             && VECTORP (XCONS (val)->cdr))
2598           {
2599             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2600             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2601           }
2602         else
2603           goto label_invalid_coding_system;
2604       }
2605       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
2606       break;
2607
2608     case 5:
2609       coding->type = coding_type_raw_text;
2610       break;
2611
2612     default:
2613       if (EQ (type, Qt))
2614         {
2615           coding->type = coding_type_undecided;
2616           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2617         }
2618       else
2619         coding->type = coding_type_no_conversion;
2620       break;
2621     }
2622   return 0;
2623
2624  label_invalid_coding_system:
2625   coding->type = coding_type_no_conversion;
2626   coding->common_flags = 0;
2627   coding->eol_type = CODING_EOL_LF;
2628   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2629     = Qnil;
2630   return -1;
2631 }
2632
2633 /* Emacs has a mechanism to automatically detect a coding system if it
2634    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2635    it's impossible to distinguish some coding systems accurately
2636    because they use the same range of codes.  So, at first, coding
2637    systems are categorized into the following categories:
2638
2639    o coding-category-emacs-mule
2640
2641         The category for a coding system which has the same code range
2642         as Emacs' internal format.  Assigned the coding-system (Lisp
2643         symbol) `emacs-mule' by default.
2644
2645    o coding-category-sjis
2646
2647         The category for a coding system which has the same code range
2648         as SJIS.  Assigned the coding-system (Lisp
2649         symbol) `japanese-shift-jis' by default.
2650
2651    o coding-category-iso-7
2652
2653         The category for a coding system which has the same code range
2654         as ISO2022 of 7-bit environment.  This doesn't use any locking
2655         shift and single shift functions.  Assigned the coding-system
2656         (Lisp symbol) `iso-2022-7bit' by default.
2657
2658    o coding-category-iso-8-1
2659
2660         The category for a coding system which has the same code range
2661         as ISO2022 of 8-bit environment and graphic plane 1 used only
2662         for DIMENSION1 charset.  This doesn't use any locking shift
2663         and single shift functions.  Assigned the coding-system (Lisp
2664         symbol) `iso-latin-1' by default.
2665
2666    o coding-category-iso-8-2
2667
2668         The category for a coding system which has the same code range
2669         as ISO2022 of 8-bit environment and graphic plane 1 used only
2670         for DIMENSION2 charset.  This doesn't use any locking shift
2671         and single shift functions.  Assigned the coding-system (Lisp
2672         symbol) `japanese-iso-8bit' by default.
2673
2674    o coding-category-iso-7-else
2675
2676         The category for a coding system which has the same code range
2677         as ISO2022 of 7-bit environment but uses locking shift or
2678         single shift functions.  Assigned the coding-system (Lisp
2679         symbol) `iso-2022-7bit-lock' by default.
2680
2681    o coding-category-iso-8-else
2682
2683         The category for a coding system which has the same code range
2684         as ISO2022 of 8-bit environment but uses locking shift or
2685         single shift functions.  Assigned the coding-system (Lisp
2686         symbol) `iso-2022-8bit-ss2' by default.
2687
2688    o coding-category-big5
2689
2690         The category for a coding system which has the same code range
2691         as BIG5.  Assigned the coding-system (Lisp symbol)
2692         `cn-big5' by default.
2693
2694    o coding-category-binary
2695
2696         The category for a coding system not categorized in any of the
2697         above.  Assigned the coding-system (Lisp symbol)
2698         `no-conversion' by default.
2699
2700    Each of them is a Lisp symbol and the value is an actual
2701    `coding-system's (this is also a Lisp symbol) assigned by a user.
2702    What Emacs does actually is to detect a category of coding system.
2703    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2704    decide only one possible category, it selects a category of the
2705    highest priority.  Priorities of categories are also specified by a
2706    user in a Lisp variable `coding-category-list'.
2707
2708 */
2709
2710 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2711    If it detects possible coding systems, return an integer in which
2712    appropriate flag bits are set.  Flag bits are defined by macros
2713    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2714
2715 int
2716 detect_coding_mask (src, src_bytes)
2717      unsigned char *src;
2718      int src_bytes;
2719 {
2720   register unsigned char c;
2721   unsigned char *src_end = src + src_bytes;
2722   int mask;
2723
2724   /* At first, skip all ASCII characters and control characters except
2725      for three ISO2022 specific control characters.  */
2726  label_loop_detect_coding:
2727   while (src < src_end)
2728     {
2729       c = *src;
2730       if (c >= 0x80
2731           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2732         break;
2733       src++;
2734     }
2735
2736   if (src >= src_end)
2737     /* We found nothing other than ASCII.  There's nothing to do.  */
2738     return CODING_CATEGORY_MASK_ANY;
2739
2740   /* The text seems to be encoded in some multilingual coding system.
2741      Now, try to find in which coding system the text is encoded.  */
2742   if (c < 0x80)
2743     {
2744       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2745       /* C is an ISO2022 specific control code of C0.  */
2746       mask = detect_coding_iso2022 (src, src_end);
2747       src++;
2748       if (mask == 0)
2749         /* No valid ISO2022 code follows C.  Try again.  */
2750         goto label_loop_detect_coding;
2751       mask |= CODING_CATEGORY_MASK_RAW_TEXT;
2752     }
2753   else if (c < 0xA0)
2754     {
2755       /* If C is a special latin extra code,
2756          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2757          or is an ISO2022 control-sequence-introducer (CSI),
2758          we should also consider the possibility of ISO2022 codings.  */
2759       if ((VECTORP (Vlatin_extra_code_table)
2760            && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2761           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2762           || (c == ISO_CODE_CSI
2763               && (src < src_end
2764                   && (*src == ']'
2765                       || (src + 1 < src_end
2766                           && src[1] == ']'
2767                           && (*src == '0' || *src == '1' || *src == '2'))))))
2768         mask = (detect_coding_iso2022 (src, src_end)
2769                 | detect_coding_sjis (src, src_end)
2770                 | detect_coding_emacs_mule (src, src_end)
2771                 | CODING_CATEGORY_MASK_RAW_TEXT);
2772
2773       else
2774         /* C is the first byte of SJIS character code,
2775            or a leading-code of Emacs' internal format (emacs-mule).  */
2776         mask = (detect_coding_sjis (src, src_end)
2777                 | detect_coding_emacs_mule (src, src_end)
2778                 | CODING_CATEGORY_MASK_RAW_TEXT);
2779     }
2780   else
2781     /* C is a character of ISO2022 in graphic plane right,
2782        or a SJIS's 1-byte character code (i.e. JISX0201),
2783        or the first byte of BIG5's 2-byte code.  */
2784     mask = (detect_coding_iso2022 (src, src_end)
2785             | detect_coding_sjis (src, src_end)
2786             | detect_coding_big5 (src, src_end)
2787             | CODING_CATEGORY_MASK_RAW_TEXT);
2788
2789   return mask;
2790 }
2791
2792 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2793    The information of the detected coding system is set in CODING.  */
2794
2795 void
2796 detect_coding (coding, src, src_bytes)
2797      struct coding_system *coding;
2798      unsigned char *src;
2799      int src_bytes;
2800 {
2801   int mask = detect_coding_mask (src, src_bytes);
2802   int idx;
2803   Lisp_Object val = Vcoding_category_list;
2804
2805   if (mask == CODING_CATEGORY_MASK_ANY)
2806     /* We found nothing other than ASCII.  There's nothing to do.  */
2807     return;
2808
2809   /* We found some plausible coding systems.  Let's use a coding
2810      system of the highest priority.  */
2811
2812   if (CONSP (val))
2813     while (!NILP (val))
2814       {
2815         idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2816         if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2817           break;
2818         val = XCONS (val)->cdr;
2819       }
2820   else
2821     val = Qnil;
2822
2823   if (NILP (val))
2824     {
2825       /* For unknown reason, `Vcoding_category_list' contains none of
2826          found categories.  Let's use any of them.  */
2827       for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2828         if (mask & (1 << idx))
2829           break;
2830     }
2831   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2832 }
2833
2834 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2835    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2836    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2837
2838 #define MAX_EOL_CHECK_COUNT 3
2839
2840 int
2841 detect_eol_type (src, src_bytes)
2842      unsigned char *src;
2843      int src_bytes;
2844 {
2845   unsigned char *src_end = src + src_bytes;
2846   unsigned char c;
2847   int total = 0;                /* How many end-of-lines are found so far.  */
2848   int eol_type = CODING_EOL_UNDECIDED;
2849   int this_eol_type;
2850
2851   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2852     {
2853       c = *src++;
2854       if (c == '\n' || c == '\r')
2855         {
2856           total++;
2857           if (c == '\n')
2858             this_eol_type = CODING_EOL_LF;
2859           else if (src >= src_end || *src != '\n')
2860             this_eol_type = CODING_EOL_CR;
2861           else
2862             this_eol_type = CODING_EOL_CRLF, src++;
2863
2864           if (eol_type == CODING_EOL_UNDECIDED)
2865             /* This is the first end-of-line.  */
2866             eol_type = this_eol_type;
2867           else if (eol_type != this_eol_type)
2868             /* The found type is different from what found before.
2869                Let's notice the caller about this inconsistency.  */
2870             return CODING_EOL_INCONSISTENT;
2871         }
2872     }
2873
2874   return eol_type;
2875 }
2876
2877 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2878    is encoded.  If it detects an appropriate format of end-of-line, it
2879    sets the information in *CODING.  */
2880
2881 void
2882 detect_eol (coding, src, src_bytes)
2883      struct coding_system *coding;
2884      unsigned char *src;
2885      int src_bytes;
2886 {
2887   Lisp_Object val;
2888   int eol_type = detect_eol_type (src, src_bytes);
2889
2890   if (eol_type == CODING_EOL_UNDECIDED)
2891     /*  We found no end-of-line in the source text.  */
2892     return;
2893
2894   if (eol_type == CODING_EOL_INCONSISTENT)
2895     {
2896 #if 0
2897       /* This code is suppressed until we find a better way to
2898          distinguish raw text file and binary file.  */
2899
2900       /* If we have already detected that the coding is raw-text, the
2901          coding should actually be no-conversion.  */
2902       if (coding->type == coding_type_raw_text)
2903         {
2904           setup_coding_system (Qno_conversion, coding);
2905           return;
2906         }
2907       /* Else, let's decode only text code anyway.  */
2908 #endif /* 0 */
2909       eol_type = CODING_EOL_LF;
2910     }
2911
2912   val = Fget (coding->symbol, Qeol_type);
2913   if (VECTORP (val) && XVECTOR (val)->size == 3)
2914     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2915 }
2916
2917 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2918    decoding, it may detect coding system and format of end-of-line if
2919    those are not yet decided.  */
2920
2921 int
2922 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2923      struct coding_system *coding;
2924      unsigned char *source, *destination;
2925      int src_bytes, dst_bytes;
2926      int *consumed;
2927 {
2928   int produced;
2929
2930   if (src_bytes <= 0)
2931     {
2932       *consumed = 0;
2933       return 0;
2934     }
2935
2936   if (coding->type == coding_type_undecided)
2937     detect_coding (coding, source, src_bytes);
2938
2939   if (coding->eol_type == CODING_EOL_UNDECIDED)
2940     detect_eol (coding, source, src_bytes);
2941
2942   coding->carryover_size = 0;
2943   switch (coding->type)
2944     {
2945     case coding_type_no_conversion:
2946     label_no_conversion:
2947       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2948       bcopy (source, destination, produced);
2949       *consumed = produced;
2950       break;
2951
2952     case coding_type_emacs_mule:
2953     case coding_type_undecided:
2954     case coding_type_raw_text:
2955       if (coding->eol_type == CODING_EOL_LF
2956           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2957         goto label_no_conversion;
2958       produced = decode_eol (coding, source, destination,
2959                              src_bytes, dst_bytes, consumed);
2960       break;
2961
2962     case coding_type_sjis:
2963       produced = decode_coding_sjis_big5 (coding, source, destination,
2964                                           src_bytes, dst_bytes, consumed,
2965                                           1);
2966       break;
2967
2968     case coding_type_iso2022:
2969       produced = decode_coding_iso2022 (coding, source, destination,
2970                                         src_bytes, dst_bytes, consumed);
2971       break;
2972
2973     case coding_type_big5:
2974       produced = decode_coding_sjis_big5 (coding, source, destination,
2975                                           src_bytes, dst_bytes, consumed,
2976                                           0);
2977       break;
2978
2979     case coding_type_ccl:
2980       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2981                              src_bytes, dst_bytes, consumed);
2982       break;
2983     }
2984
2985   return produced;
2986 }
2987
2988 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2989
2990 int
2991 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2992      struct coding_system *coding;
2993      unsigned char *source, *destination;
2994      int src_bytes, dst_bytes;
2995      int *consumed;
2996 {
2997   int produced;
2998
2999   switch (coding->type)
3000     {
3001     case coding_type_no_conversion:
3002     label_no_conversion:
3003       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
3004       if (produced > 0)
3005         {
3006           bcopy (source, destination, produced);
3007           if (coding->selective)
3008             {
3009               unsigned char *p = destination, *pend = destination + produced;
3010               while (p < pend)
3011                 if (*p++ == '\015') p[-1] = '\n';
3012             }
3013         }
3014       *consumed = produced;
3015       break;
3016
3017     case coding_type_emacs_mule:
3018     case coding_type_undecided:
3019     case coding_type_raw_text:
3020       if (coding->eol_type == CODING_EOL_LF
3021           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3022         goto label_no_conversion;
3023       produced = encode_eol (coding, source, destination,
3024                              src_bytes, dst_bytes, consumed);
3025       break;
3026
3027     case coding_type_sjis:
3028       produced = encode_coding_sjis_big5 (coding, source, destination,
3029                                           src_bytes, dst_bytes, consumed,
3030                                           1);
3031       break;
3032
3033     case coding_type_iso2022:
3034       produced = encode_coding_iso2022 (coding, source, destination,
3035                                         src_bytes, dst_bytes, consumed);
3036       break;
3037
3038     case coding_type_big5:
3039       produced = encode_coding_sjis_big5 (coding, source, destination,
3040                                           src_bytes, dst_bytes, consumed,
3041                                           0);
3042       break;
3043
3044     case coding_type_ccl:
3045       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
3046                              src_bytes, dst_bytes, consumed);
3047       break;
3048     }
3049
3050   return produced;
3051 }
3052
3053 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3054
3055 /* Return maximum size (bytes) of a buffer enough for decoding
3056    SRC_BYTES of text encoded in CODING.  */
3057
3058 int
3059 decoding_buffer_size (coding, src_bytes)
3060      struct coding_system *coding;
3061      int src_bytes;
3062 {
3063   int magnification;
3064
3065   if (coding->type == coding_type_iso2022)
3066     magnification = 3;
3067   else if (coding->type == coding_type_ccl)
3068     magnification = coding->spec.ccl.decoder.buf_magnification;
3069   else
3070     magnification = 2;
3071
3072   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3073 }
3074
3075 /* Return maximum size (bytes) of a buffer enough for encoding
3076    SRC_BYTES of text to CODING.  */
3077
3078 int
3079 encoding_buffer_size (coding, src_bytes)
3080      struct coding_system *coding;
3081      int src_bytes;
3082 {
3083   int magnification;
3084
3085   if (coding->type == coding_type_ccl)
3086     magnification = coding->spec.ccl.encoder.buf_magnification;
3087   else
3088     magnification = 3;
3089
3090   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3091 }
3092
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The scratch buffer handed out by get_conversion_buffer, and its
   current capacity in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is the single shared
   CONVERSION_BUFFER: when it is too small it is replaced by a larger
   one (capacity grows by doubling) and the old block is freed without
   copying its contents, so a pointer obtained from an earlier call
   must not be used after a later call.

   NOTE(review): no NULL check is made on the result of xmalloc; this
   presumes xmalloc reports memory exhaustion itself rather than
   returning NULL -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* If the capacity is still zero (buffer not set up yet),
         doubling it would never make progress and the loop below
         would not terminate.  Start from the minimum size instead.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3121
3122 \f
3123 #ifdef emacs
3124 /*** 7. Emacs Lisp library functions ***/
3125
3126 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3127   "Return t if OBJECT is nil or a coding-system.\n\
3128 See document of make-coding-system for coding-system object.")
3129   (obj)
3130      Lisp_Object obj;
3131 {
3132   if (NILP (obj))
3133     return Qt;
3134   if (!SYMBOLP (obj))
3135     return Qnil;
3136   /* Get coding-spec vector for OBJ.  */
3137   obj = Fget (obj, Qcoding_system);
3138   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
3139           ? Qt : Qnil);
3140 }
3141
3142 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3143        Sread_non_nil_coding_system, 1, 1, 0,
3144   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3145   (prompt)
3146      Lisp_Object prompt;
3147 {
3148   Lisp_Object val;
3149   do
3150     {
3151       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
3152                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
3153     }
3154   while (XSTRING (val)->size == 0);
3155   return (Fintern (val, Qnil));
3156 }
3157
3158 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
3159   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
3160 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
3161   (prompt, default_coding_system)
3162      Lisp_Object prompt, default_coding_system;
3163 {
3164   Lisp_Object val;
3165   if (SYMBOLP (default_coding_system))
3166     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
3167   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
3168                           Qt, Qnil, Qcoding_system_history,
3169                           default_coding_system, Qnil);
3170   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3171 }
3172
3173 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3174        1, 1, 0,
3175   "Check validity of CODING-SYSTEM.\n\
3176 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3177 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3178 The value of property should be a vector of length 5.")
3179   (coding_system)
3180      Lisp_Object coding_system;
3181 {
3182   CHECK_SYMBOL (coding_system, 0);
3183   if (!NILP (Fcoding_system_p (coding_system)))
3184     return coding_system;
3185   while (1)
3186     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3187 }
3188
3189 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3190        2, 2, 0,
3191   "Detect coding system of the text in the region between START and END.\n\
3192 Return a list of possible coding systems ordered by priority.\n\
3193 If only ASCII characters are found, it returns `undecided'\n\
3194  or its subsidiary coding system according to a detected end-of-line format.")
3195   (b, e)
3196      Lisp_Object b, e;
3197 {
3198   int coding_mask, eol_type;
3199   Lisp_Object val;
3200   int beg, end;
3201   int beg_byte, end_byte;
3202
3203   validate_region (&b, &e);
3204   beg = XINT (b), end = XINT (e);
3205   beg_byte = CHAR_TO_BYTE (beg);
3206   end_byte = CHAR_TO_BYTE (end);
3207
3208   if (beg < GPT && end >= GPT)
3209     move_gap_both (end, end_byte);
3210
3211   coding_mask = detect_coding_mask (BYTE_POS_ADDR (beg_byte),
3212                                     end_byte - beg_byte);
3213   eol_type = detect_eol_type (BYTE_POS_ADDR (beg_byte), end_byte - beg_byte);
3214
3215   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3216     {
3217       val = Qundecided;
3218       if (eol_type != CODING_EOL_UNDECIDED
3219           && eol_type != CODING_EOL_INCONSISTENT)
3220         {
3221           Lisp_Object val2;
3222           val2 = Fget (Qundecided, Qeol_type);
3223           if (VECTORP (val2))
3224             val = XVECTOR (val2)->contents[eol_type];
3225         }
3226     }
3227   else
3228     {
3229       Lisp_Object val2;
3230
3231       /* At first, gather possible coding-systems in VAL in a reverse
3232          order.  */
3233       val = Qnil;
3234       for (val2 = Vcoding_category_list;
3235            !NILP (val2);
3236            val2 = XCONS (val2)->cdr)
3237         {
3238           int idx
3239             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3240           if (coding_mask & (1 << idx))
3241             {
3242 #if 0
3243               /* This code is suppressed until we find a better way to
3244                  distinguish raw text file and binary file.  */
3245
3246               if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3247                   && eol_type == CODING_EOL_INCONSISTENT)
3248                 val = Fcons (Qno_conversion, val);
3249               else
3250 #endif /* 0 */
3251                 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3252             }
3253         }
3254
3255       /* Then, change the order of the list, while getting subsidiary
3256          coding-systems.  */
3257       val2 = val;
3258       val = Qnil;
3259       if (eol_type == CODING_EOL_INCONSISTENT)
3260         eol_type == CODING_EOL_UNDECIDED;
3261       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3262         {
3263           if (eol_type == CODING_EOL_UNDECIDED)
3264             val = Fcons (XCONS (val2)->car, val);
3265           else
3266             {
3267               Lisp_Object val3;
3268               val3 = Fget (XCONS (val2)->car, Qeol_type);
3269               if (VECTORP (val3))
3270                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3271               else
3272                 val = Fcons (XCONS (val2)->car, val);
3273             }
3274         }
3275     }
3276
3277   return val;
3278 }
3279
/* Scan text in the region between *BEGP and *ENDP, skip characters
   which we never have to encode to (iff ENCODEP is 1) or decode from
   coding system CODING at the head and tail, then set BEGP and ENDP
   to the addresses of start and end of the text we actually convert.

   On return, *BEGP == *ENDP means no conversion is necessary at all.
   When CODING requires an end-of-line conversion (its eol_type is
   neither LF nor undecided) or is a CCL coding system, both pointers
   are left unchanged.  */

void
shrink_conversion_area (begp, endp, coding, encodep)
     unsigned char **begp, **endp;
     struct coding_system *coding;
     int encodep;
{
  register unsigned char *beg_addr = *begp, *end_addr = *endp;

  if (coding->eol_type != CODING_EOL_LF
      && coding->eol_type != CODING_EOL_UNDECIDED)
    /* Since we anyway have to convert end-of-line format, it is not
       worth skipping at most 100 bytes or so.  */
    return;

  if (encodep)                  /* for encoding */
    {
      switch (coding->type)
        {
        case coding_type_no_conversion:
        case coding_type_emacs_mule:
        case coding_type_undecided:
        case coding_type_raw_text:
          /* We need no conversion.  Report an empty region by moving
             *BEGP up to *ENDP.  */
          *begp = *endp;
          return;
        case coding_type_ccl:
          /* We can't skip any data.  */
          return;
        case coding_type_iso2022:
          if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
            {
              /* Skip ASCII at the head, but start the conversion at
                 the beginning of the line on which the first
                 non-ASCII byte appears: BOL tracks the position just
                 after the last newline passed.  */
              unsigned char *bol = beg_addr;
              while (beg_addr < end_addr && *beg_addr < 0x80)
                {
                  beg_addr++;
                  if (*(beg_addr - 1) == '\n')
                    bol = beg_addr;
                }
              beg_addr = bol;
              goto label_skip_tail;
            }
          /* fall down ... */
        default:
          /* We can skip all ASCII characters at the head and tail.  */
          while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
        label_skip_tail:
          while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
          break;
        }
    }
  else                          /* for decoding */
    {
      switch (coding->type)
        {
        case coding_type_no_conversion:
          /* We need no conversion.  */
          *begp = *endp;
          return;
        case coding_type_emacs_mule:
        case coding_type_raw_text:
          if (coding->eol_type == CODING_EOL_LF)
            {
              /* We need no conversion.  */
              *begp = *endp;
              return;
            }
          /* Here eol_type is still undecided (other values returned
             at the top).  We can skip all but carriage-return.  */
          while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
          while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
          break;
        case coding_type_sjis:
        case coding_type_big5:
          /* We can skip all ASCII characters at the head.  */
          while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
          /* We can skip all ASCII characters at the tail except for
             the second byte of SJIS or BIG5 code.  */
          while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
          /* If anything was skipped at the tail, give one byte back:
             it may be such a second byte.  */
          if (end_addr != *endp)
            end_addr++;
          break;
        case coding_type_ccl:
          /* We can't skip any data.  */
          return;
        default:                /* i.e. case coding_type_iso2022: */
          /* NOTE(review): coding_type_undecided has no case of its own
             in this switch, so it is handled here as well -- confirm
             that this is intended.  */
          {
            unsigned char c;

            /* We can skip all ASCII characters except for a few
               control codes at the head.  The tail is left as is.  */
            while (beg_addr < end_addr && (c = *beg_addr) < 0x80
                   && c != ISO_CODE_CR && c != ISO_CODE_SO
                   && c != ISO_CODE_SI && c != ISO_CODE_ESC)
              beg_addr++;
          }
          break;
        }
    }
  *begp = beg_addr;
  *endp = end_addr;
  return;
}
3386
3387 /* Encode into or decode from (according to ENCODEP) coding system CODING
3388    the text between char positions B and E.  */
3389
3390 Lisp_Object
3391 code_convert_region (b, e, coding, encodep)
3392      Lisp_Object b, e;
3393      struct coding_system *coding;
3394      int encodep;
3395 {
3396   int beg, end, len, consumed, produced;
3397   char *buf;
3398   unsigned char *begp, *endp;
3399   int opoint = PT, opoint_byte = PT_BYTE;
3400   int beg_byte, end_byte, len_byte;
3401   int zv_before = ZV;
3402   int zv_byte_before = ZV_BYTE;
3403
3404   validate_region (&b, &e);
3405   beg = XINT (b), end = XINT (e);
3406   beg_byte = CHAR_TO_BYTE (beg);
3407   end_byte = CHAR_TO_BYTE (end);
3408
3409   if (beg < GPT && end >= GPT)
3410     move_gap_both (end, end_byte);
3411
3412   if (encodep && !NILP (coding->pre_write_conversion))
3413     {
3414       /* We must call a pre-conversion function which may put a new
3415          text to be converted in a new buffer.  */
3416       struct buffer *old = current_buffer, *new;
3417
3418       TEMP_SET_PT_BOTH (beg, beg_byte);
3419       call2 (coding->pre_write_conversion, b, e);
3420       if (old != current_buffer)
3421         {
3422           /* Replace the original text by the text just generated.  */
3423           len = ZV - BEGV;
3424           len_byte = ZV_BYTE - BEGV_BYTE;
3425           new = current_buffer;
3426           set_buffer_internal (old);
3427           del_range_both (beg, end, beg_byte, end_byte, 1);
3428           insert_from_buffer (new, 1, len, 0);
3429           end = beg + len;
3430           end_byte = len_byte;
3431         }
3432     }
3433
3434   /* We may be able to shrink the conversion region.  */
3435   begp = BYTE_POS_ADDR (beg_byte);
3436   endp = begp + (end_byte - beg_byte);
3437   shrink_conversion_area (&begp, &endp, coding, encodep);
3438
3439   if (begp == endp)
3440     /* We need no conversion.  */
3441     len = end - beg;
3442   else
3443     {
3444       int shrunk_beg_byte, shrunk_end_byte;
3445       int shrunk_beg;
3446       int shrunk_len_byte;
3447       int new_len_byte;
3448       int buflen;
3449       int zv_before;
3450
3451       shrunk_beg_byte = PTR_BYTE_POS (begp);
3452       shrunk_beg = BYTE_TO_CHAR (shrunk_beg_byte);
3453       shrunk_end_byte = PTR_BYTE_POS (endp);
3454       shrunk_len_byte = shrunk_end_byte - shrunk_beg_byte;
3455
3456       if (encodep)
3457         buflen = encoding_buffer_size (coding, shrunk_len_byte);
3458       else
3459         buflen = decoding_buffer_size (coding, shrunk_len_byte);
3460       buf = get_conversion_buffer (buflen);
3461
3462       coding->last_block = 1;
3463       produced = (encodep
3464                   ? encode_coding (coding, begp, buf, shrunk_len_byte, buflen,
3465                                    &consumed)
3466                   : decode_coding (coding, begp, buf, shrunk_len_byte, buflen,
3467                                    &consumed));
3468
3469       TEMP_SET_PT_BOTH (shrunk_beg, shrunk_beg_byte);
3470
3471       if (encodep)
3472         /* If we just encoded, treat the result as single-byte.  */
3473         insert_1_both (buf, produced, produced, 0, 1, 0);
3474       else
3475         insert (buf, produced);
3476
3477       del_range_byte (PT_BYTE, PT_BYTE + shrunk_len_byte, 1);
3478
3479       if (opoint >= end)
3480         {
3481           opoint += ZV - zv_before;
3482           opoint_byte += ZV_BYTE - zv_byte_before;
3483         }
3484       else if (opoint > beg)
3485         {
3486           opoint = beg;
3487           opoint_byte = beg_byte;
3488         }
3489       TEMP_SET_PT_BOTH (opoint, opoint_byte);
3490
3491       end += ZV - zv_before;
3492     }
3493
3494   if (!encodep && !NILP (coding->post_read_conversion))
3495     {
3496       Lisp_Object insval;
3497
3498       /* We must call a post-conversion function which may alter
3499          the text just converted.  */
3500       zv_before = ZV;
3501       zv_byte_before = ZV_BYTE;
3502
3503       TEMP_SET_PT_BOTH (beg, beg_byte);
3504       insval = call1 (coding->post_read_conversion, make_number (end - beg));
3505       CHECK_NUMBER (insval, 0);
3506
3507       if (opoint >= beg + ZV - zv_before)
3508         {
3509           opoint += ZV - zv_before;
3510           opoint_byte += ZV_BYTE - zv_byte_before;
3511         }
3512       else if (opoint > beg)
3513         {
3514           opoint = beg;
3515           opoint_byte = beg_byte;
3516         }
3517       TEMP_SET_PT_BOTH (opoint, opoint_byte);
3518       len = XINT (insval);
3519     }
3520
3521   return make_number (len);
3522 }
3523
3524 /* Encode or decode (according to ENCODEP) the text of string STR
3525    using coding CODING.  If NOCOPY is nil, we never return STR
3526    itself, but always a copy.  If NOCOPY is non-nil, we return STR
3527    if no change is needed.  */
3528
3529 Lisp_Object
3530 code_convert_string (str, coding, encodep, nocopy)
3531      Lisp_Object str, nocopy;
3532      struct coding_system *coding;
3533      int encodep;
3534 {
3535   int len, consumed, produced;
3536   char *buf;
3537   unsigned char *begp, *endp;
3538   int head_skip, tail_skip;
3539   struct gcpro gcpro1;
3540
3541   if (encodep && !NILP (coding->pre_write_conversion)
3542       || !encodep && !NILP (coding->post_read_conversion))
3543     {
3544       /* Since we have to call Lisp functions which assume target text
3545          is in a buffer, after setting a temporary buffer, call
3546          code_convert_region.  */
3547       int count = specpdl_ptr - specpdl;
3548       int len = XSTRING (str)->size_byte;
3549       Lisp_Object result;
3550       struct buffer *old = current_buffer;
3551
3552       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3553       temp_output_buffer_setup (" *code-converting-work*");
3554       set_buffer_internal (XBUFFER (Vstandard_output));
3555       insert_from_string (str, 0, 0, XSTRING (str)->size, len, 0);
3556       code_convert_region (make_number (BEGV), make_number (ZV),
3557                            coding, encodep);
3558       result = make_buffer_string (BEGV, ZV, 0);
3559       set_buffer_internal (old);
3560       return unbind_to (count, result);
3561     }
3562
3563   /* We may be able to shrink the conversion region.  */
3564   begp = XSTRING (str)->data;
3565   endp = begp + XSTRING (str)->size_byte;
3566   shrink_conversion_area (&begp, &endp, coding, encodep);
3567
3568   if (begp == endp)
3569     /* We need no conversion.  */
3570     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3571
3572   /* We assume that head_skip and tail_skip count single-byte characters.  */
3573   head_skip = begp - XSTRING (str)->data;
3574   tail_skip = XSTRING (str)->size_byte - head_skip - (endp - begp);
3575
3576   GCPRO1 (str);
3577
3578   if (encodep)
3579     len = encoding_buffer_size (coding, endp - begp);
3580   else
3581     len = decoding_buffer_size (coding, endp - begp);
3582   buf = get_conversion_buffer (len + head_skip + tail_skip);
3583
3584   bcopy (XSTRING (str)->data, buf, head_skip);
3585   coding->last_block = 1;
3586   produced = (encodep
3587               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3588                                buf + head_skip, endp - begp, len, &consumed)
3589               : decode_coding (coding, XSTRING (str)->data + head_skip,
3590                                buf + head_skip, endp - begp, len, &consumed));
3591   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3592          buf + head_skip + produced,
3593          tail_skip);
3594
3595   UNGCPRO;
3596
3597   if (encodep)
3598     /* When encoding, the result is all single-byte characters.  */
3599     return make_unibyte_string (buf, head_skip + produced + tail_skip);
3600
3601   /* When decoding, count properly the number of chars in the string.  */
3602   return make_string (buf, head_skip + produced + tail_skip);
3603 }
3604
3605 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3606        3, 3, "r\nzCoding system: ",
3607   "Decode current region by specified coding system.\n\
3608 When called from a program, takes three arguments:\n\
3609 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3610 Return length of decoded text.")
3611   (b, e, coding_system)
3612      Lisp_Object b, e, coding_system;
3613 {
3614   struct coding_system coding;
3615
3616   CHECK_NUMBER_COERCE_MARKER (b, 0);
3617   CHECK_NUMBER_COERCE_MARKER (e, 1);
3618   CHECK_SYMBOL (coding_system, 2);
3619
3620   if (NILP (coding_system))
3621     return make_number (XFASTINT (e) - XFASTINT (b));
3622   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3623     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3624
3625   return code_convert_region (b, e, &coding, 0);
3626 }
3627
3628 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3629        3, 3, "r\nzCoding system: ",
3630   "Encode current region by specified coding system.\n\
3631 When called from a program, takes three arguments:\n\
3632 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3633 Return length of encoded text.")
3634   (b, e, coding_system)
3635      Lisp_Object b, e, coding_system;
3636 {
3637   struct coding_system coding;
3638
3639   CHECK_NUMBER_COERCE_MARKER (b, 0);
3640   CHECK_NUMBER_COERCE_MARKER (e, 1);
3641   CHECK_SYMBOL (coding_system, 2);
3642
3643   if (NILP (coding_system))
3644     return make_number (XFASTINT (e) - XFASTINT (b));
3645   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3646     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3647
3648   return code_convert_region (b, e, &coding, 1);
3649 }
3650
3651 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3652        2, 3, 0,
3653   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3654 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3655 if the decoding operation is trivial.")
3656   (string, coding_system, nocopy)
3657      Lisp_Object string, coding_system, nocopy;
3658 {
3659   struct coding_system coding;
3660
3661   CHECK_STRING (string, 0);
3662   CHECK_SYMBOL (coding_system, 1);
3663
3664   if (NILP (coding_system))
3665     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3666   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3667     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3668
3669   return code_convert_string (string, &coding, 0, nocopy);
3670 }
3671
3672 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3673        2, 3, 0,
3674   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3675 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3676 if the encoding operation is trivial.")
3677   (string, coding_system, nocopy)
3678      Lisp_Object string, coding_system, nocopy;
3679 {
3680   struct coding_system coding;
3681
3682   CHECK_STRING (string, 0);
3683   CHECK_SYMBOL (coding_system, 1);
3684
3685   if (NILP (coding_system))
3686     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3687   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3688     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3689
3690   return code_convert_string (string, &coding, 1, nocopy);
3691 }
3692
3693 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3694   "Decode a JISX0208 character of shift-jis encoding.\n\
3695 CODE is the character code in SJIS.\n\
3696 Return the corresponding character.")
3697   (code)
3698      Lisp_Object code;
3699 {
3700   unsigned char c1, c2, s1, s2;
3701   Lisp_Object val;
3702
3703   CHECK_NUMBER (code, 0);
3704   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3705   DECODE_SJIS (s1, s2, c1, c2);
3706   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3707   return val;
3708 }
3709
3710 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3711   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3712 Return the corresponding character code in SJIS.")
3713   (ch)
3714      Lisp_Object ch;
3715 {
3716   int charset, c1, c2, s1, s2;
3717   Lisp_Object val;
3718
3719   CHECK_NUMBER (ch, 0);
3720   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3721   if (charset == charset_jisx0208)
3722     {
3723       ENCODE_SJIS (c1, c2, s1, s2);
3724       XSETFASTINT (val, (s1 << 8) | s2);
3725     }
3726   else
3727     XSETFASTINT (val, 0);
3728   return val;
3729 }
3730
3731 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3732   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3733 CODE is the character code in BIG5.\n\
3734 Return the corresponding character.")
3735   (code)
3736      Lisp_Object code;
3737 {
3738   int charset;
3739   unsigned char b1, b2, c1, c2;
3740   Lisp_Object val;
3741
3742   CHECK_NUMBER (code, 0);
3743   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3744   DECODE_BIG5 (b1, b2, charset, c1, c2);
3745   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3746   return val;
3747 }
3748
3749 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3750   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3751 Return the corresponding character code in Big5.")
3752   (ch)
3753      Lisp_Object ch;
3754 {
3755   int charset, c1, c2, b1, b2;
3756   Lisp_Object val;
3757
3758   CHECK_NUMBER (ch, 0);
3759   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3760   if (charset == charset_big5_1 || charset == charset_big5_2)
3761     {
3762       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3763       XSETFASTINT (val, (b1 << 8) | b2);
3764     }
3765   else
3766     XSETFASTINT (val, 0);
3767   return val;
3768 }
3769
3770 DEFUN ("set-terminal-coding-system-internal",
3771        Fset_terminal_coding_system_internal,
3772        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3773   (coding_system)
3774      Lisp_Object coding_system;
3775 {
3776   CHECK_SYMBOL (coding_system, 0);
3777   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3778   /* We had better not send unsafe characters to terminal.  */
3779   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3780
3781   return Qnil;
3782 }
3783
3784 DEFUN ("set-safe-terminal-coding-system-internal",
3785        Fset_safe_terminal_coding_system_internal,
3786        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3787   (coding_system)
3788      Lisp_Object coding_system;
3789 {
3790   CHECK_SYMBOL (coding_system, 0);
3791   setup_coding_system (Fcheck_coding_system (coding_system),
3792                        &safe_terminal_coding);
3793   return Qnil;
3794 }
3795
3796 DEFUN ("terminal-coding-system",
3797        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3798   "Return coding-system of your terminal.")
3799   ()
3800 {
3801   return terminal_coding.symbol;
3802 }
3803
3804 DEFUN ("set-keyboard-coding-system-internal",
3805        Fset_keyboard_coding_system_internal,
3806        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3807   (coding_system)
3808      Lisp_Object coding_system;
3809 {
3810   CHECK_SYMBOL (coding_system, 0);
3811   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3812   return Qnil;
3813 }
3814
3815 DEFUN ("keyboard-coding-system",
3816        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3817   "Return coding-system of what is sent from terminal keyboard.")
3818   ()
3819 {
3820   return keyboard_coding.symbol;
3821 }
3822
3823 \f
3824 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3825        Sfind_operation_coding_system,  1, MANY, 0,
3826   "Choose a coding system for an operation based on the target name.\n\
3827 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3828 DECODING-SYSTEM is the coding system to use for decoding\n\
3829 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3830 for encoding (in case OPERATION does encoding).\n\
3831 \n\
3832 The first argument OPERATION specifies an I/O primitive:\n\
3833   For file I/O, `insert-file-contents' or `write-region'.\n\
3834   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3835   For network I/O, `open-network-stream'.\n\
3836 \n\
3837 The remaining arguments should be the same arguments that were passed\n\
3838 to the primitive.  Depending on which primitive, one of those arguments\n\
3839 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3840 whichever argument specifies the file name is TARGET.\n\
3841 \n\
3842 TARGET has a meaning which depends on OPERATION:\n\
3843   For file I/O, TARGET is a file name.\n\
3844   For process I/O, TARGET is a process name.\n\
3845   For network I/O, TARGET is a service name or a port number\n\
3846 \n\
3847 This function looks up what specified for TARGET in,\n\
3848 `file-coding-system-alist', `process-coding-system-alist',\n\
3849 or `network-coding-system-alist' depending on OPERATION.\n\
3850 They may specify a coding system, a cons of coding systems,\n\
3851 or a function symbol to call.\n\
3852 In the last case, we call the function with one argument,\n\
3853 which is a list of all the arguments given to this function.")
3854   (nargs, args)
3855      int nargs;
3856      Lisp_Object *args;
3857 {
3858   Lisp_Object operation, target_idx, target, val;
3859   register Lisp_Object chain;
3860
3861   if (nargs < 2)
3862     error ("Too few arguments");
3863   operation = args[0];
3864   if (!SYMBOLP (operation)
3865       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3866     error ("Invalid first arguement");
3867   if (nargs < 1 + XINT (target_idx))
3868     error ("Too few arguments for operation: %s",
3869            XSYMBOL (operation)->name->data);
3870   target = args[XINT (target_idx) + 1];
3871   if (!(STRINGP (target)
3872         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3873     error ("Invalid %dth argument", XINT (target_idx) + 1);
3874
3875   chain = ((EQ (operation, Qinsert_file_contents)
3876             || EQ (operation, Qwrite_region))
3877            ? Vfile_coding_system_alist
3878            : (EQ (operation, Qopen_network_stream)
3879               ? Vnetwork_coding_system_alist
3880               : Vprocess_coding_system_alist));
3881   if (NILP (chain))
3882     return Qnil;
3883
3884   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3885     {
3886       Lisp_Object elt;
3887       elt = XCONS (chain)->car;
3888
3889       if (CONSP (elt)
3890           && ((STRINGP (target)
3891                && STRINGP (XCONS (elt)->car)
3892                && fast_string_match (XCONS (elt)->car, target) >= 0)
3893               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3894         {
3895           val = XCONS (elt)->cdr;
3896           /* Here, if VAL is both a valid coding system and a valid
3897              function symbol, we return VAL as a coding system.  */
3898           if (CONSP (val))
3899             return val;
3900           if (! SYMBOLP (val))
3901             return Qnil;
3902           if (! NILP (Fcoding_system_p (val)))
3903             return Fcons (val, val);
3904           if (! NILP (Ffboundp (val)))
3905             {
3906               val = call1 (val, Flist (nargs, args));
3907               if (CONSP (val))
3908                 return val;
3909               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
3910                 return Fcons (val, val);
3911             }
3912           return Qnil;
3913         }
3914     }
3915   return Qnil;
3916 }
3917
3918 #endif /* emacs */
3919
3920 \f
3921 /*** 8. Post-amble ***/
3922
3923 init_coding_once ()
3924 {
3925   int i;
3926
3927   /* Emacs' internal format specific initialize routine.  */
3928   for (i = 0; i <= 0x20; i++)
3929     emacs_code_class[i] = EMACS_control_code;
3930   emacs_code_class[0x0A] = EMACS_linefeed_code;
3931   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3932   for (i = 0x21 ; i < 0x7F; i++)
3933     emacs_code_class[i] = EMACS_ascii_code;
3934   emacs_code_class[0x7F] = EMACS_control_code;
3935   emacs_code_class[0x80] = EMACS_leading_code_composition;
3936   for (i = 0x81; i < 0xFF; i++)
3937     emacs_code_class[i] = EMACS_invalid_code;
3938   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3939   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3940   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3941   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3942
3943   /* ISO2022 specific initialize routine.  */
3944   for (i = 0; i < 0x20; i++)
3945     iso_code_class[i] = ISO_control_code;
3946   for (i = 0x21; i < 0x7F; i++)
3947     iso_code_class[i] = ISO_graphic_plane_0;
3948   for (i = 0x80; i < 0xA0; i++)
3949     iso_code_class[i] = ISO_control_code;
3950   for (i = 0xA1; i < 0xFF; i++)
3951     iso_code_class[i] = ISO_graphic_plane_1;
3952   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3953   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3954   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3955   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3956   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3957   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3958   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3959   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3960   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3961   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3962
3963   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3964   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3965
3966   setup_coding_system (Qnil, &keyboard_coding);
3967   setup_coding_system (Qnil, &terminal_coding);
3968   setup_coding_system (Qnil, &safe_terminal_coding);
3969
3970 #if defined (MSDOS) || defined (WINDOWSNT)
3971   system_eol_type = CODING_EOL_CRLF;
3972 #else
3973   system_eol_type = CODING_EOL_LF;
3974 #endif
3975 }
3976
3977 #ifdef emacs
3978
3979 syms_of_coding ()
3980 {
3981   Qtarget_idx = intern ("target-idx");
3982   staticpro (&Qtarget_idx);
3983
3984   Qcoding_system_history = intern ("coding-system-history");
3985   staticpro (&Qcoding_system_history);
3986   Fset (Qcoding_system_history, Qnil);
3987
3988   /* Target FILENAME is the first argument.  */
3989   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3990   /* Target FILENAME is the third argument.  */
3991   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3992
3993   Qcall_process = intern ("call-process");
3994   staticpro (&Qcall_process);
3995   /* Target PROGRAM is the first argument.  */
3996   Fput (Qcall_process, Qtarget_idx, make_number (0));
3997
3998   Qcall_process_region = intern ("call-process-region");
3999   staticpro (&Qcall_process_region);
4000   /* Target PROGRAM is the third argument.  */
4001   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
4002
4003   Qstart_process = intern ("start-process");
4004   staticpro (&Qstart_process);
4005   /* Target PROGRAM is the third argument.  */
4006   Fput (Qstart_process, Qtarget_idx, make_number (2));
4007
4008   Qopen_network_stream = intern ("open-network-stream");
4009   staticpro (&Qopen_network_stream);
4010   /* Target SERVICE is the fourth argument.  */
4011   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
4012
4013   Qcoding_system = intern ("coding-system");
4014   staticpro (&Qcoding_system);
4015
4016   Qeol_type = intern ("eol-type");
4017   staticpro (&Qeol_type);
4018
4019   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
4020   staticpro (&Qbuffer_file_coding_system);
4021
4022   Qpost_read_conversion = intern ("post-read-conversion");
4023   staticpro (&Qpost_read_conversion);
4024
4025   Qpre_write_conversion = intern ("pre-write-conversion");
4026   staticpro (&Qpre_write_conversion);
4027
4028   Qno_conversion = intern ("no-conversion");
4029   staticpro (&Qno_conversion);
4030
4031   Qundecided = intern ("undecided");
4032   staticpro (&Qundecided);
4033
4034   Qcoding_system_p = intern ("coding-system-p");
4035   staticpro (&Qcoding_system_p);
4036
4037   Qcoding_system_error = intern ("coding-system-error");
4038   staticpro (&Qcoding_system_error);
4039
4040   Fput (Qcoding_system_error, Qerror_conditions,
4041         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
4042   Fput (Qcoding_system_error, Qerror_message,
4043         build_string ("Invalid coding system"));
4044
4045   Qcoding_category_index = intern ("coding-category-index");
4046   staticpro (&Qcoding_category_index);
4047
4048   {
4049     int i;
4050     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4051       {
4052         coding_category_table[i] = intern (coding_category_name[i]);
4053         staticpro (&coding_category_table[i]);
4054         Fput (coding_category_table[i], Qcoding_category_index,
4055               make_number (i));
4056       }
4057   }
4058
4059   Qcharacter_unification_table = intern ("character-unification-table");
4060   staticpro (&Qcharacter_unification_table);
4061   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
4062         make_number (0));
4063
4064   Qcharacter_unification_table_for_decode
4065     = intern ("character-unification-table-for-decode");
4066   staticpro (&Qcharacter_unification_table_for_decode);
4067
4068   Qcharacter_unification_table_for_encode
4069     = intern ("character-unification-table-for-encode");
4070   staticpro (&Qcharacter_unification_table_for_encode);
4071
4072   Qsafe_charsets = intern ("safe-charsets");
4073   staticpro (&Qsafe_charsets);
4074
4075   Qemacs_mule = intern ("emacs-mule");
4076   staticpro (&Qemacs_mule);
4077
4078   defsubr (&Scoding_system_p);
4079   defsubr (&Sread_coding_system);
4080   defsubr (&Sread_non_nil_coding_system);
4081   defsubr (&Scheck_coding_system);
4082   defsubr (&Sdetect_coding_region);
4083   defsubr (&Sdecode_coding_region);
4084   defsubr (&Sencode_coding_region);
4085   defsubr (&Sdecode_coding_string);
4086   defsubr (&Sencode_coding_string);
4087   defsubr (&Sdecode_sjis_char);
4088   defsubr (&Sencode_sjis_char);
4089   defsubr (&Sdecode_big5_char);
4090   defsubr (&Sencode_big5_char);
4091   defsubr (&Sset_terminal_coding_system_internal);
4092   defsubr (&Sset_safe_terminal_coding_system_internal);
4093   defsubr (&Sterminal_coding_system);
4094   defsubr (&Sset_keyboard_coding_system_internal);
4095   defsubr (&Skeyboard_coding_system);
4096   defsubr (&Sfind_operation_coding_system);
4097
4098   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
4099     "List of coding systems.\n\
4100 \n\
4101 Do not alter the value of this variable manually.  This variable should be\n\
4102 updated by the functions `make-coding-system' and\n\
4103 `define-coding-system-alias'.");
4104   Vcoding_system_list = Qnil;
4105
4106   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
4107     "Alist of coding system names.\n\
4108 Each element is one element list of coding system name.\n\
4109 This variable is given to `completing-read' as TABLE argument.\n\
4110 \n\
4111 Do not alter the value of this variable manually.  This variable should be\n\
4112 updated by the functions `make-coding-system' and\n\
4113 `define-coding-system-alias'.");
4114   Vcoding_system_alist = Qnil;
4115
4116   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
4117     "List of coding-categories (symbols) ordered by priority.");
4118   {
4119     int i;
4120
4121     Vcoding_category_list = Qnil;
4122     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
4123       Vcoding_category_list
4124         = Fcons (coding_category_table[i], Vcoding_category_list);
4125   }
4126
4127   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
4128     "Specify the coding system for read operations.\n\
4129 It is useful to bind this variable with `let', but do not set it globally.\n\
4130 If the value is a coding system, it is used for decoding on read operation.\n\
4131 If not, an appropriate element is used from one of the coding system alists:\n\
4132 There are three such tables, `file-coding-system-alist',\n\
4133 `process-coding-system-alist', and `network-coding-system-alist'.");
4134   Vcoding_system_for_read = Qnil;
4135
4136   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
4137     "Specify the coding system for write operations.\n\
4138 It is useful to bind this variable with `let', but do not set it globally.\n\
4139 If the value is a coding system, it is used for encoding on write operation.\n\
4140 If not, an appropriate element is used from one of the coding system alists:\n\
4141 There are three such tables, `file-coding-system-alist',\n\
4142 `process-coding-system-alist', and `network-coding-system-alist'.");
4143   Vcoding_system_for_write = Qnil;
4144
4145   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4146     "Coding system used in the latest file or process I/O.");
4147   Vlast_coding_system_used = Qnil;
4148
4149   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4150     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4151   inhibit_eol_conversion = 0;
4152
4153   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4154     "Alist to decide a coding system to use for a file I/O operation.\n\
4155 The format is ((PATTERN . VAL) ...),\n\
4156 where PATTERN is a regular expression matching a file name,\n\
4157 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4158 If VAL is a coding system, it is used for both decoding and encoding\n\
4159 the file contents.\n\
4160 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4161 and the cdr part is used for encoding.\n\
4162 If VAL is a function symbol, the function must return a coding system\n\
4163 or a cons of coding systems which are used as above.\n\
4164 \n\
4165 See also the function `find-operation-coding-system'.");
4166   Vfile_coding_system_alist = Qnil;
4167
4168   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4169     "Alist to decide a coding system to use for a process I/O operation.\n\
4170 The format is ((PATTERN . VAL) ...),\n\
4171 where PATTERN is a regular expression matching a program name,\n\
4172 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4173 If VAL is a coding system, it is used for both decoding what received\n\
4174 from the program and encoding what sent to the program.\n\
4175 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4176 and the cdr part is used for encoding.\n\
4177 If VAL is a function symbol, the function must return a coding system\n\
4178 or a cons of coding systems which are used as above.\n\
4179 \n\
4180 See also the function `find-operation-coding-system'.");
4181   Vprocess_coding_system_alist = Qnil;
4182
4183   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4184     "Alist to decide a coding system to use for a network I/O operation.\n\
4185 The format is ((PATTERN . VAL) ...),\n\
4186 where PATTERN is a regular expression matching a network service name\n\
4187 or is a port number to connect to,\n\
4188 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4189 If VAL is a coding system, it is used for both decoding what received\n\
4190 from the network stream and encoding what sent to the network stream.\n\
4191 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4192 and the cdr part is used for encoding.\n\
4193 If VAL is a function symbol, the function must return a coding system\n\
4194 or a cons of coding systems which are used as above.\n\
4195 \n\
4196 See also the function `find-operation-coding-system'.");
4197   Vnetwork_coding_system_alist = Qnil;
4198
4199   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4200     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4201   eol_mnemonic_unix = ':';
4202
4203   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4204     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4205   eol_mnemonic_dos = '\\';
4206
4207   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4208     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4209   eol_mnemonic_mac = '/';
4210
4211   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4212     "Mnemonic character indicating end-of-line format is not yet decided.");
4213   eol_mnemonic_undecided = ':';
4214
4215   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4216     "Non-nil means ISO 2022 encoder/decoder do character unification.");
4217   Venable_character_unification = Qt;
4218
4219   DEFVAR_LISP ("standard-character-unification-table-for-decode",
4220     &Vstandard_character_unification_table_for_decode,
4221     "Table for unifying characters when reading.");
4222   Vstandard_character_unification_table_for_decode = Qnil;
4223
4224   DEFVAR_LISP ("standard-character-unification-table-for-encode",
4225     &Vstandard_character_unification_table_for_encode,
4226     "Table for unifying characters when writing.");
4227   Vstandard_character_unification_table_for_encode = Qnil;
4228
4229   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4230     "Alist of charsets vs revision numbers.\n\
4231 While encoding, if a charset (car part of an element) is found,\n\
4232 designate it with the escape sequence identifing revision (cdr part of the element).");
4233   Vcharset_revision_alist = Qnil;
4234
4235   DEFVAR_LISP ("default-process-coding-system",
4236                &Vdefault_process_coding_system,
4237     "Cons of coding systems used for process I/O by default.\n\
4238 The car part is used for decoding a process output,\n\
4239 the cdr part is used for encoding a text to be sent to a process.");
4240   Vdefault_process_coding_system = Qnil;
4241
4242   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4243     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4244 This is a vector of length 256.\n\
4245 If Nth element is non-nil, the existence of code N in a file\n\
4246 \(or output of subprocess) doesn't prevent it to be detected as\n\
4247 a coding system of ISO 2022 variant which has a flag\n\
4248 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4249 or reading output of a subprocess.\n\
4250 Only 128th through 159th elements has a meaning.");
4251   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4252 }
4253
4254 #endif /* emacs */