src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-internal), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Raw text
  71
  72   A coding system to for a text containing random 8-bit code.  Emacs
  73   does no code conversion on such a text except for end-of-line
  74   format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding-system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding-system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How end-of-line of a text is encoded depends on a system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  96
  97   Since text characters encoding and end-of-line encoding are
  98   independent, any coding system described above can take
  99   any format of end-of-line.  So, Emacs has information of format of
 100   end-of-line in each coding-system.  See section 6 for more details.
 101
 102 */
 103
 104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 105
 106   These functions check if a text between SRC and SRC_END is encoded
 107   in the coding system category XXX.  Each returns an integer value in
 108   which appropriate flag bits for the category XXX is set.  The flag
 109   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 110   template of these functions.  */
 111 #if 0
 112 int
 113 detect_coding_emacs_mule (src, src_end)
 114      unsigned char *src, *src_end;
 115 {
 116   ...
 117 }
 118 #endif
 119
 120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 121
 122   These functions decode SRC_BYTES length text at SOURCE encoded in
 123   CODING to Emacs' internal format (emacs-mule).  The resulting text
 124   goes to a place pointed to by DESTINATION, the length of which should
 125   not exceed DST_BYTES.  The number of bytes actually processed is
 126   returned as *CONSUMED.  The return value is the length of the decoded
 127   text.  Below is a template of these functions.  */
 128 #if 0
 129 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 130      struct coding_system *coding;
 131      unsigned char *source, *destination;
 132      int src_bytes, dst_bytes;
 133      int *consumed;
 134 {
 135   ...
 136 }
 137 #endif
 138
 139 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 140
 141   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 142   internal format (emacs-mule) to CODING.  The resulting text goes to
 143   a place pointed to by DESTINATION, the length of which should not
 144   exceed DST_BYTES.  The number of bytes actually processed is
 145   returned as *CONSUMED.  The return value is the length of the
 146   encoded text.  Below is a template of these functions.  */
 147 #if 0
 148 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152      int *consumed;
 153 {
 154   ...
 155 }
 156 #endif
 157
 158 /*** COMMONLY USED MACROS ***/
 159
 160 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 161    THREE_MORE_BYTES safely get one, two, and three bytes from the
 162    source text respectively.  If there are not enough bytes in the
 163    source, they jump to `label_end_of_loop'.  The caller should set
 164    variables `src' and `src_end' to appropriate areas in advance.  */
 165
 166 #define ONE_MORE_BYTE(c1)       \
 167   do {                          \
 168     if (src < src_end)          \
 169       c1 = *src++;              \
 170     else                        \
 171       goto label_end_of_loop;   \
 172   } while (0)
 173
 174 #define TWO_MORE_BYTES(c1, c2)  \
 175   do {                          \
 176     if (src + 1 < src_end)      \
 177       c1 = *src++, c2 = *src++; \
 178     else                        \
 179       goto label_end_of_loop;   \
 180   } while (0)
 181
 182 #define THREE_MORE_BYTES(c1, c2, c3)            \
 183   do {                                          \
 184     if (src + 2 < src_end)                      \
 185       c1 = *src++, c2 = *src++, c3 = *src++;    \
 186     else                                        \
 187       goto label_end_of_loop;                   \
 188   } while (0)
 189
 190 /* The following three macros DECODE_CHARACTER_ASCII,
 191    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 192    the multi-byte form of a character of each class at the place
 193    pointed by `dst'.  The caller should set the variable `dst' to
 194    point to an appropriate area and the variable `coding' to point to
 195    the coding-system of the currently decoding text in advance.  */
 196
 197 /* Decode one ASCII character C.  */
 198
 199 #define DECODE_CHARACTER_ASCII(c)                               \
 200   do {                                                          \
 201     if (COMPOSING_P (coding->composing))                        \
 202       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 203     else                                                        \
 204       *dst++ = (c);                                             \
 205   } while (0)
 206
 207 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 208    position-code is C.  */
 209
 210 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 211   do {                                                                  \
 212     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 213     if (COMPOSING_P (coding->composing))                                \
 214       *dst++ = leading_code + 0x20;                                     \
 215     else                                                                \
 216       *dst++ = leading_code;                                            \
 217     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 218       *dst++ = leading_code;                                            \
 219     *dst++ = (c) | 0x80;                                                \
 220   } while (0)
 221
 222 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 223    position-codes are C1 and C2.  */
 224
 225 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 226   do {                                                  \
 227     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 228     *dst++ = (c2) | 0x80;                               \
 229   } while (0)
 230
 231 \f
 232 /*** 1. Preamble ***/
 233
 234 #include <stdio.h>
 235
 236 #ifdef emacs
 237
 238 #include <config.h>
 239 #include "lisp.h"
 240 #include "buffer.h"
 241 #include "charset.h"
 242 #include "ccl.h"
 243 #include "coding.h"
 244 #include "window.h"
 245
 246 #else  /* not emacs */
 247
 248 #include "mulelib.h"
 249
 250 #endif /* not emacs */
 251
 252 Lisp_Object Qcoding_system, Qeol_type;
 253 Lisp_Object Qbuffer_file_coding_system;
 254 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 255 Lisp_Object Qno_conversion, Qundecided;
 256
 257 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 258 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 259 Lisp_Object Qstart_process, Qopen_network_stream;
 260 Lisp_Object Qtarget_idx;
 261
 262 /* Mnemonic character of each format of end-of-line.  */
 263 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 264 /* Mnemonic character to indicate format of end-of-line is not yet
 265    decided.  */
 266 int eol_mnemonic_undecided;
 267
 268 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 269    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 270 int system_eol_type;
 271
 272 #ifdef emacs
 273
 274 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 275
 276 /* Coding system emacs-mule is for converting only end-of-line format.  */
 277 Lisp_Object Qemacs_mule;
 278
 279 /* Coding-systems are handed between Emacs Lisp programs and C internal
 280    routines by the following three variables.  */
 281 /* Coding-system for reading files and receiving data from process.  */
 282 Lisp_Object Vcoding_system_for_read;
 283 /* Coding-system for writing files and sending data to process.  */
 284 Lisp_Object Vcoding_system_for_write;
 285 /* Coding-system actually used in the latest I/O.  */
 286 Lisp_Object Vlast_coding_system_used;
 287
 288 /* A vector of length 256 which contains information about special
 289    Latin codes (espepcially for dealing with Microsoft code).  */
 290 Lisp_Object Vlatin_extra_code_table;
 291
 292 /* Flag to inhibit code conversion of end-of-line format.  */
 293 int inhibit_eol_conversion;
 294
 295 /* Coding system to be used to encode text for terminal display.  */
 296 struct coding_system terminal_coding;
 297
 298 /* Coding system to be used to encode text for terminal display when
 299    terminal coding system is nil.  */
 300 struct coding_system safe_terminal_coding;
 301
 302 /* Coding system of what is sent from terminal keyboard.  */
 303 struct coding_system keyboard_coding;
 304
 305 Lisp_Object Vfile_coding_system_alist;
 306 Lisp_Object Vprocess_coding_system_alist;
 307 Lisp_Object Vnetwork_coding_system_alist;
 308
 309 #endif /* emacs */
 310
 311 Lisp_Object Qcoding_category_index;
 312
 313 /* List of symbols `coding-category-xxx' ordered by priority.  */
 314 Lisp_Object Vcoding_category_list;
 315
 316 /* Table of coding-systems currently assigned to each coding-category.  */
 317 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 318
 319 /* Table of names of symbol for each coding-category.  */
 320 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 321   "coding-category-emacs-mule",
 322   "coding-category-sjis",
 323   "coding-category-iso-7",
 324   "coding-category-iso-8-1",
 325   "coding-category-iso-8-2",
 326   "coding-category-iso-7-else",
 327   "coding-category-iso-8-else",
 328   "coding-category-big5",
 329   "coding-category-raw-text",
 330   "coding-category-binary"
 331 };
 332
 333 /* Flag to tell if we look up unification table on character code
 334    conversion.  */
 335 Lisp_Object Venable_character_unification;
 336 /* Standard unification table to look up on decoding (reading).  */
 337 Lisp_Object Vstandard_character_unification_table_for_decode;
 338 /* Standard unification table to look up on encoding (writing).  */
 339 Lisp_Object Vstandard_character_unification_table_for_encode;
 340
 341 Lisp_Object Qcharacter_unification_table;
 342 Lisp_Object Qcharacter_unification_table_for_decode;
 343 Lisp_Object Qcharacter_unification_table_for_encode;
 344
 345 /* Alist of charsets vs revision number.  */
 346 Lisp_Object Vcharset_revision_alist;
 347
 348 /* Default coding systems used for process I/O.  */
 349 Lisp_Object Vdefault_process_coding_system;
 350
 351 \f
 352 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 353
 354 /* Emacs' internal format for encoding multiple character sets is a
 355    kind of multi-byte encoding, i.e. characters are encoded by
 356    variable-length sequences of one-byte codes.  ASCII characters
 357    and control characters (e.g. `tab', `newline') are represented by
 358    one-byte sequences which are their ASCII codes, in the range 0x00
 359    through 0x7F.  The other characters are represented by a sequence
 360    of `base leading-code', optional `extended leading-code', and one
 361    or two `position-code's.  The length of the sequence is determined
 362    by the base leading-code.  Leading-code takes the range 0x80
 363    through 0x9F, whereas extended leading-code and position-code take
 364    the range 0xA0 through 0xFF.  See `charset.h' for more details
 365    about leading-code and position-code.
 366
 367    There's one exception to this rule.  Special leading-code
 368    `leading-code-composition' denotes that the following several
 369    characters should be composed into one character.  Leading-codes of
 370    components (except for ASCII) are added 0x20.  An ASCII character
 371    component is represented by a 2-byte sequence of `0xA0' and
 372    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 373    details of composite character.  Hence, we can summarize the code
 374    range as follows:
 375
 376    --- CODE RANGE of Emacs' internal format ---
 377    (character set)      (range)
 378    ASCII                0x00 .. 0x7F
 379    ELSE (1st byte)      0x80 .. 0x9F
 380         (rest bytes)    0xA0 .. 0xFF
 381    ---------------------------------------------
 382
 383   */
 384
 385 enum emacs_code_class_type emacs_code_class[256];
 386
 387 /* Go to the next statement only if *SRC is accessible and the code is
 388    greater than 0xA0.  */
 389 #define CHECK_CODE_RANGE_A0_FF  \
 390   do {                          \
 391     if (src >= src_end)         \
 392       goto label_end_of_switch; \
 393     else if (*src++ < 0xA0)     \
 394       return 0;                 \
 395   } while (0)
 396
 397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 398    Check if a text is encoded in Emacs' internal format.  If it is,
 399    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 400
 401 int
 402 detect_coding_emacs_mule (src, src_end)
 403      unsigned char *src, *src_end;
 404 {
 405   unsigned char c;
 406   int composing = 0;
 407
 408   while (src < src_end)
 409     {
 410       c = *src++;
 411
 412       if (composing)
 413         {
 414           if (c < 0xA0)
 415             composing = 0;
 416           else
 417             c -= 0x20;
 418         }
 419
 420       switch (emacs_code_class[c])
 421         {
 422         case EMACS_ascii_code:
 423         case EMACS_linefeed_code:
 424           break;
 425
 426         case EMACS_control_code:
 427           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 428             return 0;
 429           break;
 430
 431         case EMACS_invalid_code:
 432           return 0;
 433
 434         case EMACS_leading_code_composition: /* c == 0x80 */
 435           if (composing)
 436             CHECK_CODE_RANGE_A0_FF;
 437           else
 438             composing = 1;
 439           break;
 440
 441         case EMACS_leading_code_4:
 442           CHECK_CODE_RANGE_A0_FF;
 443           /* fall down to check it two more times ...  */
 444
 445         case EMACS_leading_code_3:
 446           CHECK_CODE_RANGE_A0_FF;
 447           /* fall down to check it one more time ...  */
 448
 449         case EMACS_leading_code_2:
 450           CHECK_CODE_RANGE_A0_FF;
 451           break;
 452
 453         default:
 454         label_end_of_switch:
 455           break;
 456         }
 457     }
 458   return CODING_CATEGORY_MASK_EMACS_MULE;
 459 }
 460
 461 \f
 462 /*** 3. ISO2022 handlers ***/
 463
 464 /* The following note describes the coding system ISO2022 briefly.
 465    Since the intention of this note is to help in understanding of
 466    the programs in this file, some parts are NOT ACCURATE or OVERLY
 467    SIMPLIFIED.  For the thorough understanding, please refer to the
 468    original document of ISO2022.
 469
 470    ISO2022 provides many mechanisms to encode several character sets
 471    in 7-bit and 8-bit environment.  If one chooses 7-bite environment,
 472    all text is encoded by codes of less than 128.  This may make the
 473    encoded text a little bit longer, but the text gets more stability
 474    to pass through several gateways (some of them strip off the MSB).
 475
 476    There are two kinds of character set: control character set and
 477    graphic character set.  The former contains control characters such
 478    as `newline' and `escape' to provide control functions (control
 479    functions are provided also by escape sequences).  The latter
 480    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 481    two control character sets and many graphic character sets.
 482
 483    Graphic character sets are classified into one of the following
 484    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 485    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 486    bytes (DIMENSION) and the number of characters in one dimension
 487    (CHARS) of the set.  In addition, each character set is assigned an
 488    identification tag (called "final character" and denoted as <F>
 489    here after) which is unique in each class.  <F> of each character
 490    set is decided by ECMA(*) when it is registered in ISO.  Code range
 491    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 492
 493    Note (*): ECMA = European Computer Manufacturers Association
 494
 495    Here are examples of graphic character set [NAME(<F>)]:
 496         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 497         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 498         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 499         o DIMENSION2_CHARS96 -- none for the moment
 500
 501    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 502         C0 [0x00..0x1F] -- control character plane 0
 503         GL [0x20..0x7F] -- graphic character plane 0
 504         C1 [0x80..0x9F] -- control character plane 1
 505         GR [0xA0..0xFF] -- graphic character plane 1
 506
 507    A control character set is directly designated and invoked to C0 or
 508    C1 by an escape sequence.  The most common case is that ISO646's
 509    control character set is designated/invoked to C0 and ISO6429's
 510    control character set is designated/invoked to C1, and usually
 511    these designations/invocations are omitted in a coded text.  With
 512    7-bit environment, only C0 can be used, and a control character for
 513    C1 is encoded by an appropriate escape sequence to fit in the
 514    environment.  All control characters for C1 are defined the
 515    corresponding escape sequences.
 516
 517    A graphic character set is at first designated to one of four
 518    graphic registers (G0 through G3), then these graphic registers are
 519    invoked to GL or GR.  These designations and invocations can be
 520    done independently.  The most common case is that G0 is invoked to
 521    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 522    these invocations and designations are omitted in a coded text.
 523    With 7-bit environment, only GL can be used.
 524
 525    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 526    and 0x7F of GL area work as control characters SPACE and DEL
 527    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 528
 529    There are two ways of invocation: locking-shift and single-shift.
 530    With locking-shift, the invocation lasts until the next different
 531    invocation, whereas with single-shift, the invocation works only
 532    for the following character and doesn't affect locking-shift.
 533    Invocations are done by the following control characters or escape
 534    sequences.
 535
 536    ----------------------------------------------------------------------
 537    function             control char    escape sequence description
 538    ----------------------------------------------------------------------
 539    SI  (shift-in)               0x0F    none            invoke G0 to GL
 540    SO  (shift-out)              0x0E    none            invoke G1 to GL
 541    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 542    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 543    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 544    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 545    ----------------------------------------------------------------------
 546    The first four are for locking-shift.  Control characters for these
 547    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 548
 549    Designations are done by the following escape sequences.
 550    ----------------------------------------------------------------------
 551    escape sequence      description
 552    ----------------------------------------------------------------------
 553    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 554    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 555    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 556    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 557    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 558    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 559    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 560    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 561    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 562    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 563    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 564    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 565    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 566    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 567    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 568    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 569    ----------------------------------------------------------------------
 570
 571    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 572    of dimension 1, chars 94, and final character <F>, and etc.
 573
 574    Note (*): Although these designations are not allowed in ISO2022,
 575    Emacs accepts them on decoding, and produces them on encoding
 576    CHARS96 character set in a coding system which is characterized as
 577    7-bit environment, non-locking-shift, and non-single-shift.
 578
 579    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 580    '(' can be omitted.  We call this as "short-form" here after.
 581
 582    Now you may notice that there are a lot of ways for encoding the
 583    same multilingual text in ISO2022.  Actually, there exists many
 584    coding systems such as Compound Text (used in X's inter client
 585    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 586    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 587    localized platforms), and all of these are variants of ISO2022.
 588
 589    In addition to the above, Emacs handles two more kinds of escape
 590    sequences: ISO6429's direction specification and Emacs' private
 591    sequence for specifying character composition.
 592
 593    ISO6429's direction specification takes the following format:
 594         o CSI ']'      -- end of the current direction
 595         o CSI '0' ']'  -- end of the current direction
 596         o CSI '1' ']'  -- start of left-to-right text
 597         o CSI '2' ']'  -- start of right-to-left text
 598    The control character CSI (0x9B: control sequence introducer) is
 599    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 600
 601    Character composition specification takes the following format:
 602         o ESC '0' -- start character composition
 603         o ESC '1' -- end character composition
 604    Since these are not standard escape sequences of any ISO, the use
 605    of them for these meaning is restricted to Emacs only.  */
 606
 607 enum iso_code_class_type iso_code_class[256];
 608
 609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 610    Check if a text is encoded in ISO2022.  If it is, returns an
 611    integer in which appropriate flag bits any of:
 612         CODING_CATEGORY_MASK_ISO_7
 613         CODING_CATEGORY_MASK_ISO_8_1
 614         CODING_CATEGORY_MASK_ISO_8_2
 615         CODING_CATEGORY_MASK_ISO_7_ELSE
 616         CODING_CATEGORY_MASK_ISO_8_ELSE
 617    are set.  If a code which should never appear in ISO2022 is found,
 618    returns 0.  */
 619
 620 int
 621 detect_coding_iso2022 (src, src_end)
 622      unsigned char *src, *src_end;
 623 {
 624   int mask = (CODING_CATEGORY_MASK_ISO_7
 625               | CODING_CATEGORY_MASK_ISO_8_1
 626               | CODING_CATEGORY_MASK_ISO_8_2
 627               | CODING_CATEGORY_MASK_ISO_7_ELSE
 628               | CODING_CATEGORY_MASK_ISO_8_ELSE
 629               );
 630   int g1 = 0;                   /* 1 iff designating to G1.  */
 631   int c, i;
 632   struct coding_system coding_iso_8_1, coding_iso_8_2;
 633
 634   /* Coding systems of these categories may accept latin extra codes.  */
 635   setup_coding_system
 636     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
 637      &coding_iso_8_1);
 638   setup_coding_system
 639     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
 640      &coding_iso_8_2);
 641
 642   while (mask && src < src_end)
 643     {
 644       c = *src++;
 645       switch (c)
 646         {
 647         case ISO_CODE_ESC:
 648           if (src >= src_end)
 649             break;
 650           c = *src++;
 651           if ((c >= '(' && c <= '/'))
 652             {
 653               /* Designation sequence for a charset of dimension 1.  */
 654               if (src >= src_end)
 655                 break;
 656               c = *src++;
 657               if (c < ' ' || c >= 0x80)
 658                 /* Invalid designation sequence.  */
 659                 return 0;
 660             }
 661           else if (c == '$')
 662             {
 663               /* Designation sequence for a charset of dimension 2.  */
 664               if (src >= src_end)
 665                 break;
 666               c = *src++;
 667               if (c >= '@' && c <= 'B')
 668                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 669                 ;
 670               else if (c >= '(' && c <= '/')
 671                 {
 672                   if (src >= src_end)
 673                     break;
 674                   c = *src++;
 675                   if (c < ' ' || c >= 0x80)
 676                     /* Invalid designation sequence.  */
 677                     return 0;
 678                 }
 679               else
 680                 /* Invalid designation sequence.  */
 681                 return 0;
 682             }
 683           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 684             /* Locking shift.  */
 685             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 686                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 687           else if (c == '0' || c == '1' || c == '2')
 688             /* Start/end composition.  */
 689             ;
 690           else
 691             /* Invalid escape sequence.  */
 692             return 0;
 693           break;
 694
 695         case ISO_CODE_SO:
 696           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 697                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 698           break;
 699
 700         case ISO_CODE_CSI:
 701         case ISO_CODE_SS2:
 702         case ISO_CODE_SS3:
 703           {
 704             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 705
 706             if (VECTORP (Vlatin_extra_code_table)
 707                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 708               {
 709                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 710                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 711                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 712                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 713               }
 714             mask &= newmask;
 715           }
 716           break;
 717
 718         default:
 719           if (c < 0x80)
 720             break;
 721           else if (c < 0xA0)
 722             {
 723               if (VECTORP (Vlatin_extra_code_table)
 724                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 725                 {
 726                   int newmask = 0;
 727
 728                   if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 729                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 730                   if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 731                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 732                   mask &= newmask;
 733                 }
 734               else
 735                 return 0;
 736             }
 737           else
 738             {
 739               unsigned char *src_begin = src;
 740
 741               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 742                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 743               while (src < src_end && *src >= 0xA0)
 744                 src++;
 745               if ((src - src_begin - 1) & 1 && src < src_end)
 746                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 747             }
 748           break;
 749         }
 750     }
 751
 752   return mask;
 753 }
 754
 755 /* Decode a character of which charset is CHARSET and the 1st position
 756    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 757    fetched from SRC and set to C2.  If CHARSET is negative, it means
 758    that we are decoding ill formed text, and what we can do is just to
 759    read C1 as is.  */
 760
 761 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 762   do {                                                                  \
 763     int c_alt, charset_alt = (charset);                                 \
 764     if (COMPOSING_HEAD_P (coding->composing))                           \
 765       {                                                                 \
 766         *dst++ = LEADING_CODE_COMPOSITION;                              \
 767         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 768           /* To tell composition rules are embeded.  */                 \
 769           *dst++ = 0xFF;                                                \
 770         coding->composing += 2;                                         \
 771       }                                                                 \
 772     if ((charset) >= 0)                                                 \
 773       {                                                                 \
 774         if (CHARSET_DIMENSION (charset) == 2)                           \
 775           ONE_MORE_BYTE (c2);                                           \
 776         if (!NILP (unification_table)                                   \
 777             && ((c_alt = unify_char (unification_table,                 \
 778                                      -1, (charset), c1, c2)) >= 0))     \
 779           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 780       }                                                                 \
 781     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 782       DECODE_CHARACTER_ASCII (c1);                                      \
 783     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 784       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 785     else                                                                \
 786       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 787     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 788       /* To tell a composition rule follows.  */                        \
 789       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 790   } while (0)
 791
 792 /* Set designation state into CODING.  */
 793 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 794   do {                                                                  \
 795     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 796                                      make_number (chars),               \
 797                                      make_number (final_char));         \
 798     if (charset >= 0)                                                   \
 799       {                                                                 \
 800         if (coding->direction == 1                                      \
 801             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 802           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 803         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 804       }                                                                 \
 805   } while (0)
 806
 807 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 808
 809 int
 810 decode_coding_iso2022 (coding, source, destination,
 811                        src_bytes, dst_bytes, consumed)
 812      struct coding_system *coding;
 813      unsigned char *source, *destination;
 814      int src_bytes, dst_bytes;
 815      int *consumed;
 816 {
 817   unsigned char *src = source;
 818   unsigned char *src_end = source + src_bytes;
 819   unsigned char *dst = destination;
 820   unsigned char *dst_end = destination + dst_bytes;
 821   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 822      from DST_END to assure that overflow checking is necessary only
 823      at the head of loop.  */
 824   unsigned char *adjusted_dst_end = dst_end - 6;
 825   int charset;
 826   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 827   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 828   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 829   Lisp_Object unification_table
 830       = coding->character_unification_table_for_decode;
 831
 832   if (!NILP (Venable_character_unification) && NILP (unification_table))
 833     unification_table = Vstandard_character_unification_table_for_decode;
 834
 835   while (src < src_end && dst < adjusted_dst_end)
 836     {
 837       /* SRC_BASE remembers the start position in source in each loop.
 838          The loop will be exited when there's not enough source text
 839          to analyze long escape sequence or 2-byte code (within macros
 840          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 841          to SRC_BASE before exiting.  */
 842       unsigned char *src_base = src;
 843       int c1 = *src++, c2;
 844
 845       switch (iso_code_class [c1])
 846         {
 847         case ISO_0x20_or_0x7F:
 848           if (!coding->composing
 849               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 850             {
 851               /* This is SPACE or DEL.  */
 852               *dst++ = c1;
 853               break;
 854             }
 855           /* This is a graphic character, we fall down ...  */
 856
 857         case ISO_graphic_plane_0:
 858           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 859             {
 860               /* This is a composition rule.  */
 861               *dst++ = c1 | 0x80;
 862               coding->composing = COMPOSING_WITH_RULE_TAIL;
 863             }
 864           else
 865             DECODE_ISO_CHARACTER (charset0, c1);
 866           break;
 867
 868         case ISO_0xA0_or_0xFF:
 869           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 870             {
 871               /* Invalid code.  */
 872               *dst++ = c1;
 873               break;
 874             }
 875           /* This is a graphic character, we fall down ... */
 876
 877         case ISO_graphic_plane_1:
 878           DECODE_ISO_CHARACTER (charset1, c1);
 879           break;
 880
 881         case ISO_control_code:
 882           /* All ISO2022 control characters in this class have the
 883              same representation in Emacs internal format.  */
 884           *dst++ = c1;
 885           break;
 886
 887         case ISO_carriage_return:
 888           if (coding->eol_type == CODING_EOL_CR)
 889             {
 890               *dst++ = '\n';
 891             }
 892           else if (coding->eol_type == CODING_EOL_CRLF)
 893             {
 894               ONE_MORE_BYTE (c1);
 895               if (c1 == ISO_CODE_LF)
 896                 *dst++ = '\n';
 897               else
 898                 {
 899                   src--;
 900                   *dst++ = c1;
 901                 }
 902             }
 903           else
 904             {
 905               *dst++ = c1;
 906             }
 907           break;
 908
 909         case ISO_shift_out:
 910           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 911             goto label_invalid_escape_sequence;
 912           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 913           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 914           break;
 915
 916         case ISO_shift_in:
 917           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 918           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 919           break;
 920
 921         case ISO_single_shift_2_7:
 922         case ISO_single_shift_2:
 923           /* SS2 is handled as an escape sequence of ESC 'N' */
 924           c1 = 'N';
 925           goto label_escape_sequence;
 926
 927         case ISO_single_shift_3:
 928           /* SS2 is handled as an escape sequence of ESC 'O' */
 929           c1 = 'O';
 930           goto label_escape_sequence;
 931
 932         case ISO_control_sequence_introducer:
 933           /* CSI is handled as an escape sequence of ESC '[' ...  */
 934           c1 = '[';
 935           goto label_escape_sequence;
 936
 937         case ISO_escape:
 938           ONE_MORE_BYTE (c1);
 939         label_escape_sequence:
 940           /* Escape sequences handled by Emacs are invocation,
 941              designation, direction specification, and character
 942              composition specification.  */
 943           switch (c1)
 944             {
 945             case '&':           /* revision of following character set */
 946               ONE_MORE_BYTE (c1);
 947               if (!(c1 >= '@' && c1 <= '~'))
 948                 goto label_invalid_escape_sequence;
 949               ONE_MORE_BYTE (c1);
 950               if (c1 != ISO_CODE_ESC)
 951                 goto label_invalid_escape_sequence;
 952               ONE_MORE_BYTE (c1);
 953               goto label_escape_sequence;
 954
 955             case '$':           /* designation of 2-byte character set */
 956               ONE_MORE_BYTE (c1);
 957               if (c1 >= '@' && c1 <= 'B')
 958                 {       /* designation of JISX0208.1978, GB2312.1980,
 959                                    or JISX0208.1980 */
 960                   DECODE_DESIGNATION (0, 2, 94, c1);
 961                 }
 962               else if (c1 >= 0x28 && c1 <= 0x2B)
 963                 {       /* designation of DIMENSION2_CHARS94 character set */
 964                   ONE_MORE_BYTE (c2);
 965                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 966                 }
 967               else if (c1 >= 0x2C && c1 <= 0x2F)
 968                 {       /* designation of DIMENSION2_CHARS96 character set */
 969                   ONE_MORE_BYTE (c2);
 970                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 971                 }
 972               else
 973                 goto label_invalid_escape_sequence;
 974               break;
 975
 976             case 'n':           /* invocation of locking-shift-2 */
 977               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 978                 goto label_invalid_escape_sequence;
 979               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 980               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 981               break;
 982
 983             case 'o':           /* invocation of locking-shift-3 */
 984               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 985                 goto label_invalid_escape_sequence;
 986               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 987               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 988               break;
 989
 990             case 'N':           /* invocation of single-shift-2 */
 991               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 992                 goto label_invalid_escape_sequence;
 993               ONE_MORE_BYTE (c1);
 994               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 995               DECODE_ISO_CHARACTER (charset, c1);
 996               break;
 997
 998             case 'O':           /* invocation of single-shift-3 */
 999               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1000                 goto label_invalid_escape_sequence;
1001               ONE_MORE_BYTE (c1);
1002               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1003               DECODE_ISO_CHARACTER (charset, c1);
1004               break;
1005
1006             case '0':           /* start composing without embeded rules */
1007               coding->composing = COMPOSING_NO_RULE_HEAD;
1008               break;
1009
1010             case '1':           /* end composing */
1011               coding->composing = COMPOSING_NO;
1012               break;
1013
1014             case '2':           /* start composing with embeded rules */
1015               coding->composing = COMPOSING_WITH_RULE_HEAD;
1016               break;
1017
1018             case '[':           /* specification of direction */
1019               /* For the moment, nested direction is not supported.
1020                  So, the value of `coding->direction' is 0 or 1: 0
1021                  means left-to-right, 1 means right-to-left.  */
1022               ONE_MORE_BYTE (c1);
1023               switch (c1)
1024                 {
1025                 case ']':       /* end of the current direction */
1026                   coding->direction = 0;
1027
1028                 case '0':       /* end of the current direction */
1029                 case '1':       /* start of left-to-right direction */
1030                   ONE_MORE_BYTE (c1);
1031                   if (c1 == ']')
1032                     coding->direction = 0;
1033                   else
1034                     goto label_invalid_escape_sequence;
1035                   break;
1036
1037                 case '2':       /* start of right-to-left direction */
1038                   ONE_MORE_BYTE (c1);
1039                   if (c1 == ']')
1040                     coding->direction= 1;
1041                   else
1042                     goto label_invalid_escape_sequence;
1043                   break;
1044
1045                 default:
1046                   goto label_invalid_escape_sequence;
1047                 }
1048               break;
1049
1050             default:
1051               if (c1 >= 0x28 && c1 <= 0x2B)
1052                 {       /* designation of DIMENSION1_CHARS94 character set */
1053                   ONE_MORE_BYTE (c2);
1054                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1055                 }
1056               else if (c1 >= 0x2C && c1 <= 0x2F)
1057                 {       /* designation of DIMENSION1_CHARS96 character set */
1058                   ONE_MORE_BYTE (c2);
1059                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1060                 }
1061               else
1062                 {
1063                   goto label_invalid_escape_sequence;
1064                 }
1065             }
1066           /* We must update these variables now.  */
1067           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1068           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1069           break;
1070
1071         label_invalid_escape_sequence:
1072           {
1073             int length = src - src_base;
1074
1075             bcopy (src_base, dst, length);
1076             dst += length;
1077           }
1078         }
1079       continue;
1080
1081     label_end_of_loop:
1082       coding->carryover_size = src - src_base;
1083       bcopy (src_base, coding->carryover, coding->carryover_size);
1084       src = src_base;
1085       break;
1086     }
1087
1088   /* If this is the last block of the text to be decoded, we had
1089      better just flush out all remaining codes in the text although
1090      they are not valid characters.  */
1091   if (coding->last_block)
1092     {
1093       bcopy (src, dst, src_end - src);
1094       dst += (src_end - src);
1095       src = src_end;
1096     }
1097   *consumed = src - source;
1098   return dst - destination;
1099 }
1100
1101 /* ISO2022 encoding stuff.  */
1102
1103 /*
1104    It is not enough to say just "ISO2022" on encoding, we have to
1105    specify more details.  In Emacs, each coding-system of ISO2022
1106    variant has the following specifications:
1107         1. Initial designation to G0 thru G3.
1108         2. Allows short-form designation?
1109         3. ASCII should be designated to G0 before control characters?
1110         4. ASCII should be designated to G0 at end of line?
1111         5. 7-bit environment or 8-bit environment?
1112         6. Use locking-shift?
1113         7. Use Single-shift?
1114    And the following two are only for Japanese:
1115         8. Use ASCII in place of JIS0201-1976-Roman?
1116         9. Use JISX0208-1983 in place of JISX0208-1978?
1117    These specifications are encoded in `coding->flags' as flag bits
1118    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1119    details.
1120 */
1121
1122 /* Produce codes (escape sequence) for designating CHARSET to graphic
1123    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1124    the coding system CODING allows, produce designation sequence of
1125    short-form.  */
1126
1127 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1128   do {                                                                  \
1129     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1130     char *intermediate_char_94 = "()*+";                                \
1131     char *intermediate_char_96 = ",-./";                                \
1132     Lisp_Object temp                                                    \
1133       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1134     if (! NILP (temp))                                                  \
1135         {                                                               \
1136         *dst++ = ISO_CODE_ESC;                                          \
1137         *dst++ = '&';                                                   \
1138         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1139       }                                                                 \
1140     *dst++ = ISO_CODE_ESC;                                              \
1141     if (CHARSET_DIMENSION (charset) == 1)                               \
1142       {                                                                 \
1143         if (CHARSET_CHARS (charset) == 94)                              \
1144           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1145         else                                                            \
1146           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1147       }                                                                 \
1148     else                                                                \
1149       {                                                                 \
1150         *dst++ = '$';                                                   \
1151         if (CHARSET_CHARS (charset) == 94)                              \
1152           {                                                             \
1153             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1154                 || reg != 0                                             \
1155                 || final_char < '@' || final_char > 'B')                \
1156               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1157           }                                                             \
1158         else                                                            \
1159           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1160       }                                                                 \
1161     *dst++ = final_char;                                                \
1162     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1163   } while (0)
1164
1165 /* The following two macros produce codes (control character or escape
1166    sequence) for ISO2022 single-shift functions (single-shift-2 and
1167    single-shift-3).  */
1168
1169 #define ENCODE_SINGLE_SHIFT_2                           \
1170   do {                                                  \
1171     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1172       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1173     else                                                \
1174       *dst++ = ISO_CODE_SS2;                            \
1175     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1176   } while (0)
1177
1178 #define ENCODE_SINGLE_SHIFT_3                           \
1179   do {                                                  \
1180     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1181       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1182     else                                                \
1183       *dst++ = ISO_CODE_SS3;                            \
1184     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1185   } while (0)
1186
1187 /* The following four macros produce codes (control character or
1188    escape sequence) for ISO2022 locking-shift functions (shift-in,
1189    shift-out, locking-shift-2, and locking-shift-3).  */
1190
1191 #define ENCODE_SHIFT_IN                         \
1192   do {                                          \
1193     *dst++ = ISO_CODE_SI;                       \
1194     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1195   } while (0)
1196
1197 #define ENCODE_SHIFT_OUT                        \
1198   do {                                          \
1199     *dst++ = ISO_CODE_SO;                       \
1200     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1201   } while (0)
1202
1203 #define ENCODE_LOCKING_SHIFT_2                  \
1204   do {                                          \
1205     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1206     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1207   } while (0)
1208
1209 #define ENCODE_LOCKING_SHIFT_3                  \
1210   do {                                          \
1211     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1212     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1213   } while (0)
1214
1215 /* Produce codes for a DIMENSION1 character whose character set is
1216    CHARSET and whose position-code is C1.  Designation and invocation
1217    sequences are also produced in advance if necessary.  */
1218
1219
1220 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1221   do {                                                                  \
1222     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1223       {                                                                 \
1224         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1225           *dst++ = c1 & 0x7F;                                           \
1226         else                                                            \
1227           *dst++ = c1 | 0x80;                                           \
1228         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1229         break;                                                          \
1230       }                                                                 \
1231     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1232       {                                                                 \
1233         *dst++ = c1 & 0x7F;                                             \
1234         break;                                                          \
1235       }                                                                 \
1236     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1237       {                                                                 \
1238         *dst++ = c1 | 0x80;                                             \
1239         break;                                                          \
1240       }                                                                 \
1241     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1242              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1243       {                                                                 \
1244         /* We should not encode this character, instead produce one or  \
1245            two `?'s.  */                                                \
1246         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1247         if (CHARSET_WIDTH (charset) == 2)                               \
1248           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1249         break;                                                          \
1250       }                                                                 \
1251     else                                                                \
1252       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1253          must invoke it, or, at first, designate it to some graphic     \
1254          register.  Then repeat the loop to actually produce the        \
1255          character.  */                                                 \
1256       dst = encode_invocation_designation (charset, coding, dst);       \
1257   } while (1)
1258
1259 /* Produce codes for a DIMENSION2 character whose character set is
1260    CHARSET and whose position-codes are C1 and C2.  Designation and
1261    invocation codes are also produced in advance if necessary.  */
1262
1263 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1264   do {                                                                  \
1265     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1266       {                                                                 \
1267         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1268           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1269         else                                                            \
1270           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1271         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1272         break;                                                          \
1273       }                                                                 \
1274     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1275       {                                                                 \
1276         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1277         break;                                                          \
1278       }                                                                 \
1279     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1280       {                                                                 \
1281         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1282         break;                                                          \
1283       }                                                                 \
1284     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1285              && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])   \
1286       {                                                                 \
1287         /* We should not encode this character, instead produce one or  \
1288            two `?'s.  */                                                \
1289         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1290         if (CHARSET_WIDTH (charset) == 2)                               \
1291           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1292         break;                                                          \
1293       }                                                                 \
1294     else                                                                \
1295       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1296          must invoke it, or, at first, designate it to some graphic     \
1297          register.  Then repeat the loop to actually produce the        \
1298          character.  */                                                 \
1299       dst = encode_invocation_designation (charset, coding, dst);       \
1300   } while (1)
1301
1302 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1303   do {                                                                    \
1304     int c_alt, charset_alt;                                               \
1305     if (!NILP (unification_table)                                         \
1306         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1307             >= 0))                                                        \
1308       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1309     else                                                                  \
1310       charset_alt = charset;                                              \
1311     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1312       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1313     else                                                                  \
1314       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1315   } while (0)
1316
1317 /* Produce designation and invocation codes at a place pointed by DST
1318    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1319    Return new DST.  */
1320
1321 unsigned char *
1322 encode_invocation_designation (charset, coding, dst)
1323      int charset;
1324      struct coding_system *coding;
1325      unsigned char *dst;
1326 {
1327   int reg;                      /* graphic register number */
1328
1329   /* At first, check designations.  */
1330   for (reg = 0; reg < 4; reg++)
1331     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1332       break;
1333
1334   if (reg >= 4)
1335     {
1336       /* CHARSET is not yet designated to any graphic registers.  */
1337       /* At first check the requested designation.  */
1338       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1339       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1340         /* Since CHARSET requests no special designation, designate it
1341            to graphic register 0.  */
1342         reg = 0;
1343
1344       ENCODE_DESIGNATION (charset, reg, coding);
1345     }
1346
1347   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1348       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1349     {
1350       /* Since the graphic register REG is not invoked to any graphic
1351          planes, invoke it to graphic plane 0.  */
1352       switch (reg)
1353         {
1354         case 0:                 /* graphic register 0 */
1355           ENCODE_SHIFT_IN;
1356           break;
1357
1358         case 1:                 /* graphic register 1 */
1359           ENCODE_SHIFT_OUT;
1360           break;
1361
1362         case 2:                 /* graphic register 2 */
1363           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1364             ENCODE_SINGLE_SHIFT_2;
1365           else
1366             ENCODE_LOCKING_SHIFT_2;
1367           break;
1368
1369         case 3:                 /* graphic register 3 */
1370           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1371             ENCODE_SINGLE_SHIFT_3;
1372           else
1373             ENCODE_LOCKING_SHIFT_3;
1374           break;
1375         }
1376     }
1377   return dst;
1378 }
1379
1380 /* The following two macros produce codes for indicating composition.  */
1381 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1382 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1383 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1384
1385 /* The following three macros produce codes for indicating direction
1386    of text.  */
1387 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1388   do {                                                  \
1389     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1390       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1391     else                                                \
1392       *dst++ = ISO_CODE_CSI;                            \
1393   } while (0)
1394
1395 #define ENCODE_DIRECTION_R2L    \
1396   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1397
1398 #define ENCODE_DIRECTION_L2R    \
1399   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1400
1401 /* Produce codes for designation and invocation to reset the graphic
1402    planes and registers to initial state.  */
1403 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1404   do {                                                                      \
1405     int reg;                                                                \
1406     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1407       ENCODE_SHIFT_IN;                                                      \
1408     for (reg = 0; reg < 4; reg++)                                           \
1409       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1410           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1411               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1412         ENCODE_DESIGNATION                                                  \
1413           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1414   } while (0)
1415
1416 /* Produce designation sequences of charsets in the line started from
1417    *SRC to a place pointed by DSTP.
1418
1419    If the current block ends before any end-of-line, we may fail to
1420    find all the necessary *designations.  */
1421 encode_designation_at_bol (coding, table, src, src_end, dstp)
1422      struct coding_system *coding;
1423      Lisp_Object table;
1424      unsigned char *src, *src_end, **dstp;
1425 {
1426   int charset, c, found = 0, reg;
1427   /* Table of charsets to be designated to each graphic register.  */
1428   int r[4];
1429   unsigned char *dst = *dstp;
1430
1431   for (reg = 0; reg < 4; reg++)
1432     r[reg] = -1;
1433
1434   while (src < src_end && *src != '\n' && found < 4)
1435     {
1436       int bytes = BYTES_BY_CHAR_HEAD (*src);
1437
1438       if (NILP (table))
1439         charset = CHARSET_AT (src);
1440       else
1441         {
1442           int c_alt, c1, c2;
1443
1444           SPLIT_STRING(src, bytes, charset, c1, c2);
1445           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1446             charset = CHAR_CHARSET (c_alt);
1447         }
1448
1449       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1450       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1451         {
1452           found++;
1453           r[reg] = charset;
1454         }
1455
1456       src += bytes;
1457     }
1458
1459   if (found)
1460     {
1461       for (reg = 0; reg < 4; reg++)
1462         if (r[reg] >= 0
1463             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1464           ENCODE_DESIGNATION (r[reg], reg, coding);
1465       *dstp = dst;
1466     }
1467 }
1468
1469 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1470
1471 int
1472 encode_coding_iso2022 (coding, source, destination,
1473                        src_bytes, dst_bytes, consumed)
1474      struct coding_system *coding;
1475      unsigned char *source, *destination;
1476      int src_bytes, dst_bytes;
1477      int *consumed;
1478 {
1479   unsigned char *src = source;
1480   unsigned char *src_end = source + src_bytes;
1481   unsigned char *dst = destination;
1482   unsigned char *dst_end = destination + dst_bytes;
1483   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1484      from DST_END to assure overflow checking is necessary only at the
1485      head of loop.  */
1486   unsigned char *adjusted_dst_end = dst_end - 19;
1487   Lisp_Object unification_table
1488       = coding->character_unification_table_for_encode;
1489
1490   if (!NILP (Venable_character_unification) && NILP (unification_table))
1491     unification_table = Vstandard_character_unification_table_for_encode;
1492
1493   while (src < src_end && dst < adjusted_dst_end)
1494     {
1495       /* SRC_BASE remembers the start position in source in each loop.
1496          The loop will be exited when there's not enough source text
1497          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1498          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1499          reset to SRC_BASE before exiting.  */
1500       unsigned char *src_base = src;
1501       int charset, c1, c2, c3, c4;
1502
1503       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1504           && CODING_SPEC_ISO_BOL (coding))
1505         {
1506           /* We have to produce designation sequences if any now.  */
1507           encode_designation_at_bol (coding, unification_table,
1508                                      src, src_end, &dst);
1509           CODING_SPEC_ISO_BOL (coding) = 0;
1510         }
1511
1512       c1 = *src++;
1513       /* If we are seeing a component of a composite character, we are
1514          seeing a leading-code specially encoded for composition, or a
1515          composition rule if composing with rule.  We must set C1
1516          to a normal leading-code or an ASCII code.  If we are not at
1517          a composed character, we must reset the composition state.  */
1518       if (COMPOSING_P (coding->composing))
1519         {
1520           if (c1 < 0xA0)
1521             {
1522               /* We are not in a composite character any longer.  */
1523               coding->composing = COMPOSING_NO;
1524               ENCODE_COMPOSITION_END;
1525             }
1526           else
1527             {
1528               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1529                 {
1530                   *dst++ = c1 & 0x7F;
1531                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1532                   continue;
1533                 }
1534               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1535                 coding->composing = COMPOSING_WITH_RULE_RULE;
1536               if (c1 == 0xA0)
1537                 {
1538                   /* This is an ASCII component.  */
1539                   ONE_MORE_BYTE (c1);
1540                   c1 &= 0x7F;
1541                 }
1542               else
1543                 /* This is a leading-code of non ASCII component.  */
1544                 c1 -= 0x20;
1545             }
1546         }
1547
1548       /* Now encode one character.  C1 is a control character, an
1549          ASCII character, or a leading-code of multi-byte character.  */
1550       switch (emacs_code_class[c1])
1551         {
1552         case EMACS_ascii_code:
1553           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1554           break;
1555
1556         case EMACS_control_code:
1557           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1558             ENCODE_RESET_PLANE_AND_REGISTER;
1559           *dst++ = c1;
1560           break;
1561
1562         case EMACS_carriage_return_code:
1563           if (!coding->selective)
1564             {
1565               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1566                 ENCODE_RESET_PLANE_AND_REGISTER;
1567               *dst++ = c1;
1568               break;
1569             }
1570           /* fall down to treat '\r' as '\n' ...  */
1571
1572         case EMACS_linefeed_code:
1573           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1574             ENCODE_RESET_PLANE_AND_REGISTER;
1575           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1576             bcopy (coding->spec.iso2022.initial_designation,
1577                    coding->spec.iso2022.current_designation,
1578                    sizeof coding->spec.iso2022.initial_designation);
1579           if (coding->eol_type == CODING_EOL_LF
1580               || coding->eol_type == CODING_EOL_UNDECIDED)
1581             *dst++ = ISO_CODE_LF;
1582           else if (coding->eol_type == CODING_EOL_CRLF)
1583             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1584           else
1585             *dst++ = ISO_CODE_CR;
1586           CODING_SPEC_ISO_BOL (coding) = 1;
1587           break;
1588
1589         case EMACS_leading_code_2:
1590           ONE_MORE_BYTE (c2);
1591           if (c2 < 0xA0)
1592             {
1593               /* invalid sequence */
1594               *dst++ = c1;
1595               *dst++ = c2;
1596             }
1597           else
1598             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1599           break;
1600
1601         case EMACS_leading_code_3:
1602           TWO_MORE_BYTES (c2, c3);
1603           if (c2 < 0xA0 || c3 < 0xA0)
1604             {
1605               /* invalid sequence */
1606               *dst++ = c1;
1607               *dst++ = c2;
1608               *dst++ = c3;
1609             }
1610           else if (c1 < LEADING_CODE_PRIVATE_11)
1611             ENCODE_ISO_CHARACTER (c1, c2, c3);
1612           else
1613             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1614           break;
1615
1616         case EMACS_leading_code_4:
1617           THREE_MORE_BYTES (c2, c3, c4);
1618           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1619             {
1620               /* invalid sequence */
1621               *dst++ = c1;
1622               *dst++ = c2;
1623               *dst++ = c3;
1624               *dst++ = c4;
1625             }
1626           else
1627             ENCODE_ISO_CHARACTER (c2, c3, c4);
1628           break;
1629
1630         case EMACS_leading_code_composition:
1631           ONE_MORE_BYTE (c2);
1632           if (c2 < 0xA0)
1633             {
1634               /* invalid sequence */
1635               *dst++ = c1;
1636               *dst++ = c2;
1637             }
1638           else if (c2 == 0xFF)
1639             {
1640               coding->composing = COMPOSING_WITH_RULE_HEAD;
1641               ENCODE_COMPOSITION_WITH_RULE_START;
1642             }
1643           else
1644             {
1645               /* Rewind one byte because it is a character code of
1646                  composition elements.  */
1647               src--;
1648               coding->composing = COMPOSING_NO_RULE_HEAD;
1649               ENCODE_COMPOSITION_NO_RULE_START;
1650             }
1651           break;
1652
1653         case EMACS_invalid_code:
1654           *dst++ = c1;
1655           break;
1656         }
1657       continue;
1658     label_end_of_loop:
1659       /* We reach here because the source date ends not at character
1660          boundary.  */
1661       coding->carryover_size = src_end - src_base;
1662       bcopy (src_base, coding->carryover, coding->carryover_size);
1663       src = src_end;
1664       break;
1665     }
1666
1667   /* If this is the last block of the text to be encoded, we must
1668      reset graphic planes and registers to the initial state.  */
1669   if (src >= src_end && coding->last_block)
1670     {
1671       ENCODE_RESET_PLANE_AND_REGISTER;
1672       if (coding->carryover_size > 0
1673           && coding->carryover_size < (dst_end - dst))
1674         {
1675           bcopy (coding->carryover, dst, coding->carryover_size);
1676           dst += coding->carryover_size;
1677           coding->carryover_size = 0;
1678         }
1679     }
1680   *consumed = src - source;
1681   return dst - destination;
1682 }
1683
1684 \f
1685 /*** 4. SJIS and BIG5 handlers ***/
1686
1687 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1688    quite widely.  So, for the moment, Emacs supports them in the bare
1689    C code.  But, in the future, they may be supported only by CCL.  */
1690
1691 /* SJIS is a coding system encoding three character sets: ASCII, right
1692    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1693    as is.  A character of charset katakana-jisx0201 is encoded by
1694    "position-code + 0x80".  A character of charset japanese-jisx0208
1695    is encoded in 2-byte but two position-codes are divided and shifted
1696    so that it fit in the range below.
1697
1698    --- CODE RANGE of SJIS ---
1699    (character set)      (range)
1700    ASCII                0x00 .. 0x7F
1701    KATAKANA-JISX0201    0xA0 .. 0xDF
1702    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1703             (2nd byte)  0x40 .. 0xFF
1704    -------------------------------
1705
1706 */
1707
1708 /* BIG5 is a coding system encoding two character sets: ASCII and
1709    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1710    character set and is encoded in two-byte.
1711
1712    --- CODE RANGE of BIG5 ---
1713    (character set)      (range)
1714    ASCII                0x00 .. 0x7F
1715    Big5 (1st byte)      0xA1 .. 0xFE
1716         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1717    --------------------------
1718
1719    Since the number of characters in Big5 is larger than maximum
1720    characters in Emacs' charset (96x96), it can't be handled as one
1721    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1722    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1723    contains frequently used characters and the latter contains less
1724    frequently used characters.  */
1725
1726 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1727    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1728    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1729    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1730
1731 /* Number of Big5 characters which have the same code in 1st byte.  */
1732 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1733
1734 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1735   do {                                                                  \
1736     unsigned int temp                                                   \
1737       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1738     if (b1 < 0xC9)                                                      \
1739       charset = charset_big5_1;                                         \
1740     else                                                                \
1741       {                                                                 \
1742         charset = charset_big5_2;                                       \
1743         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1744       }                                                                 \
1745     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1746     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1747   } while (0)
1748
1749 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1750   do {                                                                  \
1751     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1752     if (charset == charset_big5_2)                                      \
1753       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1754     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1755     b2 = temp % BIG5_SAME_ROW;                                          \
1756     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1757   } while (0)
1758
1759 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1760   do {                                                                  \
1761     int c_alt, charset_alt = (charset);                                 \
1762     if (!NILP (unification_table)                                       \
1763         && ((c_alt = unify_char (unification_table,                     \
1764                                  -1, (charset), c1, c2)) >= 0))         \
1765           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
1766     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
1767       DECODE_CHARACTER_ASCII (c1);                                      \
1768     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
1769       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1770     else                                                                \
1771       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1772   } while (0)
1773
1774 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1775   do {                                                                    \
1776     int c_alt, charset_alt;                                               \
1777     if (!NILP (unification_table)                                         \
1778         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1779             >= 0))                                                        \
1780       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1781     else                                                                  \
1782       charset_alt = charset;                                              \
1783     if (charset_alt == charset_ascii)                                     \
1784       *dst++ = c1;                                                        \
1785     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1786       {                                                                   \
1787         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1788           *dst++ = c1;                                                    \
1789         else                                                              \
1790           *dst++ = charset_alt, *dst++ = c1;                              \
1791       }                                                                   \
1792     else                                                                  \
1793       {                                                                   \
1794         c1 &= 0x7F, c2 &= 0x7F;                                           \
1795         if (sjis_p && charset_alt == charset_jisx0208)                    \
1796           {                                                               \
1797             unsigned char s1, s2;                                         \
1798                                                                           \
1799             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1800             *dst++ = s1, *dst++ = s2;                                     \
1801           }                                                               \
1802         else if (!sjis_p                                                  \
1803                  && (charset_alt == charset_big5_1                        \
1804                      || charset_alt == charset_big5_2))                   \
1805           {                                                               \
1806             unsigned char b1, b2;                                         \
1807                                                                           \
1808             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1809             *dst++ = b1, *dst++ = b2;                                     \
1810           }                                                               \
1811         else                                                              \
1812           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1813       }                                                                   \
1814   } while (0);
1815
1816 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1817    Check if a text is encoded in SJIS.  If it is, return
1818    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1819
1820 int
1821 detect_coding_sjis (src, src_end)
1822      unsigned char *src, *src_end;
1823 {
1824   unsigned char c;
1825
1826   while (src < src_end)
1827     {
1828       c = *src++;
1829       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1830         return 0;
1831       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1832         {
1833           if (src < src_end && *src++ < 0x40)
1834             return 0;
1835         }
1836     }
1837   return CODING_CATEGORY_MASK_SJIS;
1838 }
1839
1840 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1841    Check if a text is encoded in BIG5.  If it is, return
1842    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1843
1844 int
1845 detect_coding_big5 (src, src_end)
1846      unsigned char *src, *src_end;
1847 {
1848   unsigned char c;
1849
1850   while (src < src_end)
1851     {
1852       c = *src++;
1853       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1854         return 0;
1855       if (c >= 0xA1)
1856         {
1857           if (src >= src_end)
1858             break;
1859           c = *src++;
1860           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1861             return 0;
1862         }
1863     }
1864   return CODING_CATEGORY_MASK_BIG5;
1865 }
1866
1867 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1868    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1869
1870 int
1871 decode_coding_sjis_big5 (coding, source, destination,
1872                          src_bytes, dst_bytes, consumed, sjis_p)
1873      struct coding_system *coding;
1874      unsigned char *source, *destination;
1875      int src_bytes, dst_bytes;
1876      int *consumed;
1877      int sjis_p;
1878 {
1879   unsigned char *src = source;
1880   unsigned char *src_end = source + src_bytes;
1881   unsigned char *dst = destination;
1882   unsigned char *dst_end = destination + dst_bytes;
1883   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1884      from DST_END to assure overflow checking is necessary only at the
1885      head of loop.  */
1886   unsigned char *adjusted_dst_end = dst_end - 3;
1887   Lisp_Object unification_table
1888       = coding->character_unification_table_for_decode;
1889
1890   if (!NILP (Venable_character_unification) && NILP (unification_table))
1891     unification_table = Vstandard_character_unification_table_for_decode;
1892
1893   while (src < src_end && dst < adjusted_dst_end)
1894     {
1895       /* SRC_BASE remembers the start position in source in each loop.
1896          The loop will be exited when there's not enough source text
1897          to analyze two-byte character (within macro ONE_MORE_BYTE).
1898          In that case, SRC is reset to SRC_BASE before exiting.  */
1899       unsigned char *src_base = src;
1900       unsigned char c1 = *src++, c2, c3, c4;
1901
1902       if (c1 == '\r')
1903         {
1904           if (coding->eol_type == CODING_EOL_CRLF)
1905             {
1906               ONE_MORE_BYTE (c2);
1907               if (c2 == '\n')
1908                 *dst++ = c2;
1909               else
1910                 /* To process C2 again, SRC is subtracted by 1.  */
1911                 *dst++ = c1, src--;
1912             }
1913           else
1914             *dst++ = c1;
1915         }
1916       else if (c1 < 0x20)
1917         *dst++ = c1;
1918       else if (c1 < 0x80)
1919         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1920       else if (c1 < 0xA0 || c1 >= 0xE0)
1921         {
1922           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1923           if (sjis_p)
1924             {
1925               ONE_MORE_BYTE (c2);
1926               DECODE_SJIS (c1, c2, c3, c4);
1927               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1928             }
1929           else if (c1 >= 0xE0 && c1 < 0xFF)
1930             {
1931               int charset;
1932
1933               ONE_MORE_BYTE (c2);
1934               DECODE_BIG5 (c1, c2, charset, c3, c4);
1935               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1936             }
1937           else                  /* Invalid code */
1938             *dst++ = c1;
1939         }
1940       else
1941         {
1942           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1943           if (sjis_p)
1944             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1945           else
1946             {
1947               int charset;
1948
1949               ONE_MORE_BYTE (c2);
1950               DECODE_BIG5 (c1, c2, charset, c3, c4);
1951               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1952             }
1953         }
1954       continue;
1955
1956     label_end_of_loop:
1957       coding->carryover_size = src - src_base;
1958       bcopy (src_base, coding->carryover, coding->carryover_size);
1959       src = src_base;
1960       break;
1961     }
1962
1963   *consumed = src - source;
1964   return dst - destination;
1965 }
1966
1967 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1968    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1969    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1970    sure that all these charsets are registered as official charset
1971    (i.e. do not have extended leading-codes).  Characters of other
1972    charsets are produced without any encoding.  If SJIS_P is 1, encode
1973    SJIS text, else encode BIG5 text.  */
1974
1975 int
1976 encode_coding_sjis_big5 (coding, source, destination,
1977                          src_bytes, dst_bytes, consumed, sjis_p)
1978      struct coding_system *coding;
1979      unsigned char *source, *destination;
1980      int src_bytes, dst_bytes;
1981      int *consumed;
1982      int sjis_p;
1983 {
1984   unsigned char *src = source;
1985   unsigned char *src_end = source + src_bytes;
1986   unsigned char *dst = destination;
1987   unsigned char *dst_end = destination + dst_bytes;
1988   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1989      from DST_END to assure overflow checking is necessary only at the
1990      head of loop.  */
1991   unsigned char *adjusted_dst_end = dst_end - 1;
1992   Lisp_Object unification_table
1993       = coding->character_unification_table_for_encode;
1994
1995   if (!NILP (Venable_character_unification) && NILP (unification_table))
1996     unification_table = Vstandard_character_unification_table_for_encode;
1997
1998   while (src < src_end && dst < adjusted_dst_end)
1999     {
2000       /* SRC_BASE remembers the start position in source in each loop.
2001          The loop will be exited when there's not enough source text
2002          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2003          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2004          before exiting.  */
2005       unsigned char *src_base = src;
2006       unsigned char c1 = *src++, c2, c3, c4;
2007
2008       if (coding->composing)
2009         {
2010           if (c1 == 0xA0)
2011             {
2012               ONE_MORE_BYTE (c1);
2013               c1 &= 0x7F;
2014             }
2015           else if (c1 >= 0xA0)
2016             c1 -= 0x20;
2017           else
2018             coding->composing = 0;
2019         }
2020
2021       switch (emacs_code_class[c1])
2022         {
2023         case EMACS_ascii_code:
2024           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2025           break;
2026
2027         case EMACS_control_code:
2028           *dst++ = c1;
2029           break;
2030
2031         case EMACS_carriage_return_code:
2032           if (!coding->selective)
2033             {
2034               *dst++ = c1;
2035               break;
2036             }
2037           /* fall down to treat '\r' as '\n' ...  */
2038
2039         case EMACS_linefeed_code:
2040           if (coding->eol_type == CODING_EOL_LF
2041               || coding->eol_type == CODING_EOL_UNDECIDED)
2042             *dst++ = '\n';
2043           else if (coding->eol_type == CODING_EOL_CRLF)
2044             *dst++ = '\r', *dst++ = '\n';
2045           else
2046             *dst++ = '\r';
2047           break;
2048
2049         case EMACS_leading_code_2:
2050           ONE_MORE_BYTE (c2);
2051           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2052           break;
2053
2054         case EMACS_leading_code_3:
2055           TWO_MORE_BYTES (c2, c3);
2056           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2057           break;
2058
2059         case EMACS_leading_code_4:
2060           THREE_MORE_BYTES (c2, c3, c4);
2061           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2062           break;
2063
2064         case EMACS_leading_code_composition:
2065           coding->composing = 1;
2066           break;
2067
2068         default:                /* i.e. case EMACS_invalid_code: */
2069           *dst++ = c1;
2070         }
2071       continue;
2072
2073     label_end_of_loop:
2074       coding->carryover_size = src_end - src_base;
2075       bcopy (src_base, coding->carryover, coding->carryover_size);
2076       src = src_end;
2077       break;
2078     }
2079
2080   *consumed = src - source;
2081   return dst - destination;
2082 }
2083
2084 \f
2085 /*** 5. End-of-line handlers ***/
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2088    This function is called only when `coding->eol_type' is
2089    CODING_EOL_CRLF or CODING_EOL_CR.  */
2090
2091 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2092      struct coding_system *coding;
2093      unsigned char *source, *destination;
2094      int src_bytes, dst_bytes;
2095      int *consumed;
2096 {
2097   unsigned char *src = source;
2098   unsigned char *src_end = source + src_bytes;
2099   unsigned char *dst = destination;
2100   unsigned char *dst_end = destination + dst_bytes;
2101   int produced;
2102
2103   switch (coding->eol_type)
2104     {
2105     case CODING_EOL_CRLF:
2106       {
2107         /* Since the maximum bytes produced by each loop is 2, we
2108            subtract 1 from DST_END to assure overflow checking is
2109            necessary only at the head of loop.  */
2110         unsigned char *adjusted_dst_end = dst_end - 1;
2111
2112         while (src < src_end && dst < adjusted_dst_end)
2113           {
2114             unsigned char *src_base = src;
2115             unsigned char c = *src++;
2116             if (c == '\r')
2117               {
2118                 ONE_MORE_BYTE (c);
2119                 if (c != '\n')
2120                   *dst++ = '\r';
2121                 *dst++ = c;
2122               }
2123             else
2124               *dst++ = c;
2125             continue;
2126
2127           label_end_of_loop:
2128             coding->carryover_size = src - src_base;
2129             bcopy (src_base, coding->carryover, coding->carryover_size);
2130             src = src_base;
2131             break;
2132           }
2133         *consumed = src - source;
2134         produced = dst - destination;
2135         break;
2136       }
2137
2138     case CODING_EOL_CR:
2139       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2140       bcopy (source, destination, produced);
2141       dst_end = destination + produced;
2142       while (dst < dst_end)
2143         if (*dst++ == '\r') dst[-1] = '\n';
2144       *consumed = produced;
2145       break;
2146
2147     default:                    /* i.e. case: CODING_EOL_LF */
2148       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2149       bcopy (source, destination, produced);
2150       *consumed = produced;
2151       break;
2152     }
2153
2154   return produced;
2155 }
2156
2157 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2158    format of end-of-line according to `coding->eol_type'.  If
2159    `coding->selective' is 1, code '\r' in source text also means
2160    end-of-line.  */
2161
2162 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2163      struct coding_system *coding;
2164      unsigned char *source, *destination;
2165      int src_bytes, dst_bytes;
2166      int *consumed;
2167 {
2168   unsigned char *src = source;
2169   unsigned char *dst = destination;
2170   int produced;
2171
2172   if (src_bytes <= 0)
2173     return 0;
2174
2175   switch (coding->eol_type)
2176     {
2177     case CODING_EOL_LF:
2178     case CODING_EOL_UNDECIDED:
2179       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2180       bcopy (source, destination, produced);
2181       if (coding->selective)
2182         {
2183           int i = produced;
2184           while (i--)
2185             if (*dst++ == '\r') dst[-1] = '\n';
2186         }
2187       *consumed = produced;
2188
2189     case CODING_EOL_CRLF:
2190       {
2191         unsigned char c;
2192         unsigned char *src_end = source + src_bytes;
2193         unsigned char *dst_end = destination + dst_bytes;
2194         /* Since the maximum bytes produced by each loop is 2, we
2195            subtract 1 from DST_END to assure overflow checking is
2196            necessary only at the head of loop.  */
2197         unsigned char *adjusted_dst_end = dst_end - 1;
2198
2199         while (src < src_end && dst < adjusted_dst_end)
2200           {
2201             c = *src++;
2202             if (c == '\n' || (c == '\r' && coding->selective))
2203               *dst++ = '\r', *dst++ = '\n';
2204             else
2205               *dst++ = c;
2206           }
2207         produced = dst - destination;
2208         *consumed = src - source;
2209         break;
2210       }
2211
2212     default:                    /* i.e. case CODING_EOL_CR: */
2213       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2214       bcopy (source, destination, produced);
2215       {
2216         int i = produced;
2217         while (i--)
2218           if (*dst++ == '\n') dst[-1] = '\r';
2219       }
2220       *consumed = produced;
2221     }
2222
2223   return produced;
2224 }
2225
2226 \f
2227 /*** 6. C library functions ***/
2228
2229 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2230    has a property `coding-system'.  The value of this property is a
2231    vector of length 5 (called as coding-vector).  Among elements of
2232    this vector, the first (element[0]) and the fifth (element[4])
2233    carry important information for decoding/encoding.  Before
2234    decoding/encoding, this information should be set in fields of a
2235    structure of type `coding_system'.
2236
2237    A value of property `coding-system' can be a symbol of another
2238    subsidiary coding-system.  In that case, Emacs gets coding-vector
2239    from that symbol.
2240
2241    `element[0]' contains information to be set in `coding->type'.  The
2242    value and its meaning is as follows:
2243
2244    0 -- coding_type_emacs_mule
2245    1 -- coding_type_sjis
2246    2 -- coding_type_iso2022
2247    3 -- coding_type_big5
2248    4 -- coding_type_ccl encoder/decoder written in CCL
2249    nil -- coding_type_no_conversion
2250    t -- coding_type_undecided (automatic conversion on decoding,
2251                                no-conversion on encoding)
2252
2253    `element[4]' contains information to be set in `coding->flags' and
2254    `coding->spec'.  The meaning varies by `coding->type'.
2255
2256    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2257    of length 32 (of which the first 13 sub-elements are used now).
2258    Meanings of these sub-elements are:
2259
2260    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2261         If the value is an integer of valid charset, the charset is
2262         assumed to be designated to graphic register N initially.
2263
2264         If the value is minus, it is a minus value of charset which
2265         reserves graphic register N, which means that the charset is
2266         not designated initially but should be designated to graphic
2267         register N just before encoding a character in that charset.
2268
2269         If the value is nil, graphic register N is never used on
2270         encoding.
2271
2272    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2273         Each value takes t or nil.  See the section ISO2022 of
2274         `coding.h' for more information.
2275
2276    If `coding->type' is `coding_type_big5', element[4] is t to denote
2277    BIG5-ETen or nil to denote BIG5-HKU.
2278
2279    If `coding->type' takes the other value, element[4] is ignored.
2280
2281    Emacs Lisp's coding system also carries information about format of
2282    end-of-line in a value of property `eol-type'.  If the value is
2283    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2284    means CODING_EOL_CR.  If it is not integer, it should be a vector
2285    of subsidiary coding systems of which property `eol-type' has one
2286    of above values.
2287
2288 */
2289
2290 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2291    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2292    is setup so that no conversion is necessary and return -1, else
2293    return 0.  */
2294
2295 int
2296 setup_coding_system (coding_system, coding)
2297      Lisp_Object coding_system;
2298      struct coding_system *coding;
2299 {
2300   Lisp_Object type, eol_type;
2301
2302   /* At first, set several fields to default values.  */
2303   coding->require_flushing = 0;
2304   coding->last_block = 0;
2305   coding->selective = 0;
2306   coding->composing = 0;
2307   coding->direction = 0;
2308   coding->carryover_size = 0;
2309   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2310   coding->character_unification_table_for_decode = Qnil;
2311   coding->character_unification_table_for_encode = Qnil;
2312
2313   Vlast_coding_system_used = coding->symbol = coding_system;
2314   eol_type = Qnil;
2315   /* Get value of property `coding-system' until we get a vector.
2316      While doing that, also get values of properties
2317      `post-read-conversion', `pre-write-conversion',
2318      `character-unification-table-for-decode',
2319      `character-unification-table-for-encode' and `eol-type'.  */
2320   while (!NILP (coding_system) && SYMBOLP (coding_system))
2321     {
2322       if (NILP (coding->post_read_conversion))
2323         coding->post_read_conversion = Fget (coding_system,
2324                                              Qpost_read_conversion);
2325       if (NILP (coding->pre_write_conversion))
2326         coding->pre_write_conversion = Fget (coding_system,
2327                                              Qpre_write_conversion);
2328       if (!inhibit_eol_conversion && NILP (eol_type))
2329         eol_type = Fget (coding_system, Qeol_type);
2330
2331       if (NILP (coding->character_unification_table_for_decode))
2332         coding->character_unification_table_for_decode
2333           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2334
2335       if (NILP (coding->character_unification_table_for_encode))
2336         coding->character_unification_table_for_encode
2337           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2338
2339       coding_system = Fget (coding_system, Qcoding_system);
2340     }
2341
2342   while (!NILP (coding->character_unification_table_for_decode)
2343          && SYMBOLP (coding->character_unification_table_for_decode))
2344         coding->character_unification_table_for_decode
2345           = Fget (coding->character_unification_table_for_decode,
2346                   Qcharacter_unification_table_for_decode);
2347   if (!NILP (coding->character_unification_table_for_decode)
2348       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2349       coding->character_unification_table_for_decode = Qnil;
2350
2351   while (!NILP (coding->character_unification_table_for_encode)
2352          && SYMBOLP (coding->character_unification_table_for_encode))
2353         coding->character_unification_table_for_encode
2354           = Fget (coding->character_unification_table_for_encode,
2355                   Qcharacter_unification_table_for_encode);
2356   if (!NILP (coding->character_unification_table_for_encode)
2357       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2358       coding->character_unification_table_for_encode = Qnil;
2359
2360   if (!VECTORP (coding_system)
2361       || XVECTOR (coding_system)->size != 5)
2362     goto label_invalid_coding_system;
2363
2364   if (VECTORP (eol_type))
2365     coding->eol_type = CODING_EOL_UNDECIDED;
2366   else if (XFASTINT (eol_type) == 1)
2367     coding->eol_type = CODING_EOL_CRLF;
2368   else if (XFASTINT (eol_type) == 2)
2369     coding->eol_type = CODING_EOL_CR;
2370   else
2371     coding->eol_type = CODING_EOL_LF;
2372
2373   type = XVECTOR (coding_system)->contents[0];
2374   switch (XFASTINT (type))
2375     {
2376     case 0:
2377       coding->type = coding_type_emacs_mule;
2378       break;
2379
2380     case 1:
2381       coding->type = coding_type_sjis;
2382       break;
2383
2384     case 2:
2385       coding->type = coding_type_iso2022;
2386       {
2387         Lisp_Object val;
2388         Lisp_Object *flags;
2389         int i, charset, default_reg_bits = 0;
2390
2391         val = XVECTOR (coding_system)->contents[4];
2392
2393         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2394           goto label_invalid_coding_system;
2395
2396         flags = XVECTOR (val)->contents;
2397         coding->flags
2398           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2399              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2400              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2401              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2402              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2403              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2404              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2405              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2406              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2407              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2408              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2409              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2410              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2411              );
2412
2413         /* Invoke graphic register 0 to plane 0.  */
2414         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2415         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2416         CODING_SPEC_ISO_INVOCATION (coding, 1)
2417           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2418         /* Not single shifting at first.  */
2419         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2420         /* Beginning of buffer should also be regarded as bol. */
2421         CODING_SPEC_ISO_BOL (coding) = 1;
2422
2423         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2424            FLAGS[REG] can be one of below:
2425                 integer CHARSET: CHARSET occupies register I,
2426                 t: designate nothing to REG initially, but can be used
2427                   by any charsets,
2428                 list of integer, nil, or t: designate the first
2429                   element (if integer) to REG initially, the remaining
2430                   elements (if integer) is designated to REG on request,
2431                   if an element is t, REG can be used by any charset,
2432                 nil: REG is never used.  */
2433         for (charset = 0; charset <= MAX_CHARSET; charset++)
2434           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2435             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2436         bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2437         for (i = 0; i < 4; i++)
2438           {
2439             if (INTEGERP (flags[i])
2440                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2441                 || (charset = get_charset_id (flags[i])) >= 0)
2442               {
2443                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2444                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2445                 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2446               }
2447             else if (EQ (flags[i], Qt))
2448               {
2449                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2450                 default_reg_bits |= 1 << i;
2451               }
2452             else if (CONSP (flags[i]))
2453               {
2454                 Lisp_Object tail = flags[i];
2455
2456                 if (INTEGERP (XCONS (tail)->car)
2457                     && (charset = XINT (XCONS (tail)->car),
2458                         CHARSET_VALID_P (charset))
2459                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2460                   {
2461                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2462                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2463                     CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2464                   }
2465                 else
2466                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2467                 tail = XCONS (tail)->cdr;
2468                 while (CONSP (tail))
2469                   {
2470                     if (INTEGERP (XCONS (tail)->car)
2471                         && (charset = XINT (XCONS (tail)->car),
2472                             CHARSET_VALID_P (charset))
2473                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2474                       {
2475                         CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2476                           = i;
2477                         CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2478                           = 1;
2479                       }
2480                     else if (EQ (XCONS (tail)->car, Qt))
2481                       default_reg_bits |= 1 << i;
2482                     tail = XCONS (tail)->cdr;
2483                   }
2484               }
2485             else
2486               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2487
2488             CODING_SPEC_ISO_DESIGNATION (coding, i)
2489               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2490           }
2491
2492         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2493           {
2494             /* REG 1 can be used only by locking shift in 7-bit env.  */
2495             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2496               default_reg_bits &= ~2;
2497             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2498               /* Without any shifting, only REG 0 and 1 can be used.  */
2499               default_reg_bits &= 3;
2500           }
2501
2502         for (charset = 0; charset <= MAX_CHARSET; charset++)
2503           if (CHARSET_VALID_P (charset)
2504               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2505                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2506             {
2507               /* We have not yet decided where to designate CHARSET.  */
2508               int reg_bits = default_reg_bits;
2509
2510               if (CHARSET_CHARS (charset) == 96)
2511                 /* A charset of CHARS96 can't be designated to REG 0.  */
2512                 reg_bits &= ~1;
2513
2514               if (reg_bits)
2515                 /* There exist some default graphic register.  */
2516                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2517                   = (reg_bits & 1
2518                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2519               else
2520                 /* We anyway have to designate CHARSET to somewhere.  */
2521                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2522                   = (CHARSET_CHARS (charset) == 94
2523                      ? 0
2524                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2525                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2526                         ? 1
2527                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2528                            ? 2 : 0)));
2529             }
2530       }
2531       coding->require_flushing = 1;
2532       break;
2533
2534     case 3:
2535       coding->type = coding_type_big5;
2536       coding->flags
2537         = (NILP (XVECTOR (coding_system)->contents[4])
2538            ? CODING_FLAG_BIG5_HKU
2539            : CODING_FLAG_BIG5_ETEN);
2540       break;
2541
2542     case 4:
2543       coding->type = coding_type_ccl;
2544       {
2545         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2546         if (CONSP  (val)
2547             && VECTORP (XCONS (val)->car)
2548             && VECTORP (XCONS (val)->cdr))
2549           {
2550             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2551             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2552           }
2553         else
2554           goto label_invalid_coding_system;
2555       }
2556       coding->require_flushing = 1;
2557       break;
2558
2559     case 5:
2560       coding->type = coding_type_raw_text;
2561       break;
2562
2563     default:
2564       if (EQ (type, Qt))
2565         coding->type = coding_type_undecided;
2566       else
2567         coding->type = coding_type_no_conversion;
2568       break;
2569     }
2570   return 0;
2571
2572  label_invalid_coding_system:
2573   coding->type = coding_type_no_conversion;
2574   coding->eol_type = CODING_EOL_LF;
2575   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2576     = Qnil;
2577   return -1;
2578 }
2579
2580 /* Emacs has a mechanism to automatically detect a coding system if it
2581    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2582    it's impossible to distinguish some coding systems accurately
2583    because they use the same range of codes.  So, at first, coding
2584    systems are categorized into 7, those are:
2585
2586    o coding-category-emacs-mule
2587
2588         The category for a coding system which has the same code range
2589         as Emacs' internal format.  Assigned the coding-system (Lisp
2590         symbol) `emacs-mule' by default.
2591
2592    o coding-category-sjis
2593
2594         The category for a coding system which has the same code range
2595         as SJIS.  Assigned the coding-system (Lisp
2596         symbol) `japanese-shift-jis' by default.
2597
2598    o coding-category-iso-7
2599
2600         The category for a coding system which has the same code range
2601         as ISO2022 of 7-bit environment.  This doesn't use any locking
2602         shift and single shift functions.  Assigned the coding-system
2603         (Lisp symbol) `iso-2022-7bit' by default.
2604
2605    o coding-category-iso-8-1
2606
2607         The category for a coding system which has the same code range
2608         as ISO2022 of 8-bit environment and graphic plane 1 used only
2609         for DIMENSION1 charset.  This doesn't use any locking shift
2610         and single shift functions.  Assigned the coding-system (Lisp
2611         symbol) `iso-latin-1' by default.
2612
2613    o coding-category-iso-8-2
2614
2615         The category for a coding system which has the same code range
2616         as ISO2022 of 8-bit environment and graphic plane 1 used only
2617         for DIMENSION2 charset.  This doesn't use any locking shift
2618         and single shift functions.  Assigned the coding-system (Lisp
2619         symbol) `japanese-iso-8bit' by default.
2620
2621    o coding-category-iso-7-else
2622
2623         The category for a coding system which has the same code range
2624         as ISO2022 of 7-bit environemnt but uses locking shift or
2625         single shift functions.  Assigned the coding-system (Lisp
2626         symbol) `iso-2022-7bit-lock' by default.
2627
2628    o coding-category-iso-8-else
2629
2630         The category for a coding system which has the same code range
2631         as ISO2022 of 8-bit environemnt but uses locking shift or
2632         single shift functions.  Assigned the coding-system (Lisp
2633         symbol) `iso-2022-8bit-ss2' by default.
2634
2635    o coding-category-big5
2636
2637         The category for a coding system which has the same code range
2638         as BIG5.  Assigned the coding-system (Lisp symbol)
2639         `cn-big5' by default.
2640
2641    o coding-category-binary
2642
2643         The category for a coding system not categorized in any of the
2644         above.  Assigned the coding-system (Lisp symbol)
2645         `no-conversion' by default.
2646
2647    Each of them is a Lisp symbol and the value is an actual
2648    `coding-system's (this is also a Lisp symbol) assigned by a user.
2649    What Emacs does actually is to detect a category of coding system.
2650    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2651    decide only one possible category, it selects a category of the
2652    highest priority.  Priorities of categories are also specified by a
2653    user in a Lisp variable `coding-category-list'.
2654
2655 */
2656
2657 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2658    If it detects possible coding systems, return an integer in which
2659    appropriate flag bits are set.  Flag bits are defined by macros
2660    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2661
2662 int
2663 detect_coding_mask (src, src_bytes)
2664      unsigned char *src;
2665      int src_bytes;
2666 {
2667   register unsigned char c;
2668   unsigned char *src_end = src + src_bytes;
2669   int mask;
2670
2671   /* At first, skip all ASCII characters and control characters except
2672      for three ISO2022 specific control characters.  */
2673  label_loop_detect_coding:
2674   while (src < src_end)
2675     {
2676       c = *src;
2677       if (c >= 0x80
2678           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2679         break;
2680       src++;
2681     }
2682
2683   if (src >= src_end)
2684     /* We found nothing other than ASCII.  There's nothing to do.  */
2685     return CODING_CATEGORY_MASK_ANY;
2686
2687   /* The text seems to be encoded in some multilingual coding system.
2688      Now, try to find in which coding system the text is encoded.  */
2689   if (c < 0x80)
2690     {
2691       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2692       /* C is an ISO2022 specific control code of C0.  */
2693       mask = detect_coding_iso2022 (src, src_end);
2694       src++;
2695       if (mask == 0)
2696         /* No valid ISO2022 code follows C.  Try again.  */
2697         goto label_loop_detect_coding;
2698       mask |= CODING_CATEGORY_MASK_RAW_TEXT;
2699     }
2700   else if (c < 0xA0)
2701     {
2702       /* If C is a special latin extra code,
2703          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2704          or is an ISO2022 control-sequence-introducer (CSI),
2705          we should also consider the possibility of ISO2022 codings.  */
2706       if ((VECTORP (Vlatin_extra_code_table)
2707            && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2708           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2709           || (c == ISO_CODE_CSI
2710               && (src < src_end
2711                   && (*src == ']'
2712                       || (src + 1 < src_end
2713                           && src[1] == ']'
2714                           && (*src == '0' || *src == '1' || *src == '2'))))))
2715         mask = (detect_coding_iso2022 (src, src_end)
2716                 | detect_coding_sjis (src, src_end)
2717                 | detect_coding_emacs_mule (src, src_end)
2718                 | CODING_CATEGORY_MASK_RAW_TEXT);
2719
2720       else
2721         /* C is the first byte of SJIS character code,
2722            or a leading-code of Emacs' internal format (emacs-mule).  */
2723         mask = (detect_coding_sjis (src, src_end)
2724                 | detect_coding_emacs_mule (src, src_end)
2725                 | CODING_CATEGORY_MASK_RAW_TEXT);
2726     }
2727   else
2728     /* C is a character of ISO2022 in graphic plane right,
2729        or a SJIS's 1-byte character code (i.e. JISX0201),
2730        or the first byte of BIG5's 2-byte code.  */
2731     mask = (detect_coding_iso2022 (src, src_end)
2732             | detect_coding_sjis (src, src_end)
2733             | detect_coding_big5 (src, src_end)
2734             | CODING_CATEGORY_MASK_RAW_TEXT);
2735
2736   return mask;
2737 }
2738
2739 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2740    The information of the detected coding system is set in CODING.  */
2741
2742 void
2743 detect_coding (coding, src, src_bytes)
2744      struct coding_system *coding;
2745      unsigned char *src;
2746      int src_bytes;
2747 {
2748   int mask = detect_coding_mask (src, src_bytes);
2749   int idx;
2750   Lisp_Object val = Vcoding_category_list;
2751
2752   if (mask == CODING_CATEGORY_MASK_ANY)
2753     /* We found nothing other than ASCII.  There's nothing to do.  */
2754     return;
2755
2756   /* We found some plausible coding systems.  Let's use a coding
2757      system of the highest priority.  */
2758
2759   if (CONSP (val))
2760     while (!NILP (val))
2761       {
2762         idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2763         if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2764           break;
2765         val = XCONS (val)->cdr;
2766       }
2767   else
2768     val = Qnil;
2769
2770   if (NILP (val))
2771     {
2772       /* For unknown reason, `Vcoding_category_list' contains none of
2773          found categories.  Let's use any of them.  */
2774       for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2775         if (mask & (1 << idx))
2776           break;
2777     }
2778   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2779 }
2780
2781 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2782    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2783    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2784
2785 #define MAX_EOL_CHECK_COUNT 3
2786
2787 int
2788 detect_eol_type (src, src_bytes)
2789      unsigned char *src;
2790      int src_bytes;
2791 {
2792   unsigned char *src_end = src + src_bytes;
2793   unsigned char c;
2794   int total = 0;                /* How many end-of-lines are found so far.  */
2795   int eol_type = CODING_EOL_UNDECIDED;
2796   int this_eol_type;
2797
2798   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2799     {
2800       c = *src++;
2801       if (c == '\n' || c == '\r')
2802         {
2803           total++;
2804           if (c == '\n')
2805             this_eol_type = CODING_EOL_LF;
2806           else if (src >= src_end || *src != '\n')
2807             this_eol_type = CODING_EOL_CR;
2808           else
2809             this_eol_type = CODING_EOL_CRLF, src++;
2810
2811           if (eol_type == CODING_EOL_UNDECIDED)
2812             /* This is the first end-of-line.  */
2813             eol_type = this_eol_type;
2814           else if (eol_type != this_eol_type)
2815             /* The found type is different from what found before.
2816                Let's notice the caller about this inconsistency.  */
2817             return CODING_EOL_INCONSISTENT;
2818         }
2819     }
2820
2821   return eol_type;
2822 }
2823
2824 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2825    is encoded.  If it detects an appropriate format of end-of-line, it
2826    sets the information in *CODING.  */
2827
2828 void
2829 detect_eol (coding, src, src_bytes)
2830      struct coding_system *coding;
2831      unsigned char *src;
2832      int src_bytes;
2833 {
2834   Lisp_Object val, coding_system;
2835   int eol_type = detect_eol_type (src, src_bytes);
2836
2837   if (eol_type == CODING_EOL_UNDECIDED)
2838     /*  We found no end-of-line in the source text.  */
2839     return;
2840
2841   if (eol_type == CODING_EOL_INCONSISTENT)
2842     {
2843 #if 0
2844       /* This code is suppressed until we find a better way to
2845          distinguish raw text file and binary file.  */
2846
2847       /* If we have already detected that the coding is raw-text, the
2848          coding should actually be no-conversion.  */
2849       if (coding->type == coding_type_raw_text)
2850         {
2851           setup_coding_system (Qno_conversion, coding);
2852           return;
2853         }
2854       /* Else, let's decode only text code anyway.  */
2855 #endif /* 0 */
2856       eol_type = CODING_EOL_LF;
2857     }
2858
2859   coding_system = coding->symbol;
2860   while (!NILP (coding_system)
2861          && NILP (val = Fget (coding_system, Qeol_type)))
2862     coding_system = Fget (coding_system, Qcoding_system);
2863   if (VECTORP (val) && XVECTOR (val)->size == 3)
2864     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2865 }
2866
2867 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2868    decoding, it may detect coding system and format of end-of-line if
2869    those are not yet decided.  */
2870
2871 int
2872 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2873      struct coding_system *coding;
2874      unsigned char *source, *destination;
2875      int src_bytes, dst_bytes;
2876      int *consumed;
2877 {
2878   int produced;
2879
2880   if (src_bytes <= 0)
2881     {
2882       *consumed = 0;
2883       return 0;
2884     }
2885
2886   if (coding->type == coding_type_undecided)
2887     detect_coding (coding, source, src_bytes);
2888
2889   if (coding->eol_type == CODING_EOL_UNDECIDED)
2890     detect_eol (coding, source, src_bytes);
2891
2892   coding->carryover_size = 0;
2893   switch (coding->type)
2894     {
2895     case coding_type_no_conversion:
2896     label_no_conversion:
2897       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2898       bcopy (source, destination, produced);
2899       *consumed = produced;
2900       break;
2901
2902     case coding_type_emacs_mule:
2903     case coding_type_undecided:
2904     case coding_type_raw_text:
2905       if (coding->eol_type == CODING_EOL_LF
2906           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2907         goto label_no_conversion;
2908       produced = decode_eol (coding, source, destination,
2909                              src_bytes, dst_bytes, consumed);
2910       break;
2911
2912     case coding_type_sjis:
2913       produced = decode_coding_sjis_big5 (coding, source, destination,
2914                                           src_bytes, dst_bytes, consumed,
2915                                           1);
2916       break;
2917
2918     case coding_type_iso2022:
2919       produced = decode_coding_iso2022 (coding, source, destination,
2920                                         src_bytes, dst_bytes, consumed);
2921       break;
2922
2923     case coding_type_big5:
2924       produced = decode_coding_sjis_big5 (coding, source, destination,
2925                                           src_bytes, dst_bytes, consumed,
2926                                           0);
2927       break;
2928
2929     case coding_type_ccl:
2930       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2931                              src_bytes, dst_bytes, consumed);
2932       break;
2933     }
2934
2935   return produced;
2936 }
2937
2938 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2939
2940 int
2941 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2942      struct coding_system *coding;
2943      unsigned char *source, *destination;
2944      int src_bytes, dst_bytes;
2945      int *consumed;
2946 {
2947   int produced;
2948
2949   switch (coding->type)
2950     {
2951     case coding_type_no_conversion:
2952     label_no_conversion:
2953       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2954       if (produced > 0)
2955         {
2956           bcopy (source, destination, produced);
2957           if (coding->selective)
2958             {
2959               unsigned char *p = destination, *pend = destination + produced;
2960               while (p < pend)
2961                 if (*p++ == '\015') p[-1] = '\n';
2962             }
2963         }
2964       *consumed = produced;
2965       break;
2966
2967     case coding_type_emacs_mule:
2968     case coding_type_undecided:
2969     case coding_type_raw_text:
2970       if (coding->eol_type == CODING_EOL_LF
2971           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2972         goto label_no_conversion;
2973       produced = encode_eol (coding, source, destination,
2974                              src_bytes, dst_bytes, consumed);
2975       break;
2976
2977     case coding_type_sjis:
2978       produced = encode_coding_sjis_big5 (coding, source, destination,
2979                                           src_bytes, dst_bytes, consumed,
2980                                           1);
2981       break;
2982
2983     case coding_type_iso2022:
2984       produced = encode_coding_iso2022 (coding, source, destination,
2985                                         src_bytes, dst_bytes, consumed);
2986       break;
2987
2988     case coding_type_big5:
2989       produced = encode_coding_sjis_big5 (coding, source, destination,
2990                                           src_bytes, dst_bytes, consumed,
2991                                           0);
2992       break;
2993
2994     case coding_type_ccl:
2995       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2996                              src_bytes, dst_bytes, consumed);
2997       break;
2998     }
2999
3000   return produced;
3001 }
3002
3003 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3004
3005 /* Return maximum size (bytes) of a buffer enough for decoding
3006    SRC_BYTES of text encoded in CODING.  */
3007
3008 int
3009 decoding_buffer_size (coding, src_bytes)
3010      struct coding_system *coding;
3011      int src_bytes;
3012 {
3013   int magnification;
3014
3015   if (coding->type == coding_type_iso2022)
3016     magnification = 3;
3017   else if (coding->type == coding_type_ccl)
3018     magnification = coding->spec.ccl.decoder.buf_magnification;
3019   else
3020     magnification = 2;
3021
3022   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3023 }
3024
3025 /* Return maximum size (bytes) of a buffer enough for encoding
3026    SRC_BYTES of text to CODING.  */
3027
3028 int
3029 encoding_buffer_size (coding, src_bytes)
3030      struct coding_system *coding;
3031      int src_bytes;
3032 {
3033   int magnification;
3034
3035   if (coding->type == coding_type_ccl)
3036     magnification = coding->spec.ccl.encoder.buf_magnification;
3037   else
3038     magnification = 3;
3039
3040   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3041 }
3042
3043 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3044 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3045 #endif
3046
3047 char *conversion_buffer;
3048 int conversion_buffer_size;
3049
3050 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3051    or decoding.  Sufficient memory is allocated automatically.  If we
3052    run out of memory, return NULL.  */
3053
3054 char *
3055 get_conversion_buffer (size)
3056      int size;
3057 {
3058   if (size > conversion_buffer_size)
3059     {
3060       char *buf;
3061       int real_size = conversion_buffer_size * 2;
3062
3063       while (real_size < size) real_size *= 2;
3064       buf = (char *) xmalloc (real_size);
3065       xfree (conversion_buffer);
3066       conversion_buffer = buf;
3067       conversion_buffer_size = real_size;
3068     }
3069   return conversion_buffer;
3070 }
3071
3072 \f
3073 #ifdef emacs
3074 /*** 7. Emacs Lisp library functions ***/
3075
3076 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3077        1, 1, 0,
3078   "Return coding-spec of CODING-SYSTEM.\n\
3079 If CODING-SYSTEM is not a valid coding-system, return nil.")
3080   (obj)
3081      Lisp_Object obj;
3082 {
3083   while (SYMBOLP (obj) && !NILP (obj))
3084     obj = Fget (obj, Qcoding_system);
3085   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3086           ? Qnil : obj);
3087 }
3088
3089 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3090   "Return t if OBJECT is nil or a coding-system.\n\
3091 See document of make-coding-system for coding-system object.")
3092   (obj)
3093      Lisp_Object obj;
3094 {
3095   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3096 }
3097
3098 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3099        Sread_non_nil_coding_system, 1, 1, 0,
3100   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3101   (prompt)
3102      Lisp_Object prompt;
3103 {
3104   Lisp_Object val;
3105   do
3106     {
3107       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3108                               Qt, Qnil, Qnil, Qnil, Qnil);
3109     }
3110   while (XSTRING (val)->size == 0);
3111   return (Fintern (val, Qnil));
3112 }
3113
3114 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3115   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3116   (prompt)
3117      Lisp_Object prompt;
3118 {
3119   Lisp_Object val;
3120   val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3121                           Qt, Qnil, Qnil, Qnil, Qnil);
3122   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3123 }
3124
3125 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3126        1, 1, 0,
3127   "Check validity of CODING-SYSTEM.\n\
3128 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3129 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3130 The value of property should be a vector of length 5.")
3131   (coding_system)
3132      Lisp_Object coding_system;
3133 {
3134   CHECK_SYMBOL (coding_system, 0);
3135   if (!NILP (Fcoding_system_p (coding_system)))
3136     return coding_system;
3137   while (1)
3138     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3139 }
3140
3141 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3142        2, 2, 0,
3143   "Detect coding system of the text in the region between START and END.\n\
3144 Return a list of possible coding systems ordered by priority.\n\
3145 If only ASCII characters are found, it returns `undecided'\n\
3146  or its subsidiary coding system according to a detected end-of-line format.")
3147   (b, e)
3148      Lisp_Object b, e;
3149 {
3150   int coding_mask, eol_type;
3151   Lisp_Object val;
3152   int beg, end;
3153
3154   validate_region (&b, &e);
3155   beg = XINT (b), end = XINT (e);
3156   if (beg < GPT && end >= GPT) move_gap (end);
3157
3158   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3159   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3160
3161   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3162     {
3163       val = Qundecided;
3164       if (eol_type != CODING_EOL_UNDECIDED
3165           && eol_type != CODING_EOL_INCONSISTENT)
3166         {
3167           Lisp_Object val2;
3168           val2 = Fget (Qundecided, Qeol_type);
3169           if (VECTORP (val2))
3170             val = XVECTOR (val2)->contents[eol_type];
3171         }
3172     }
3173   else
3174     {
3175       Lisp_Object val2;
3176
3177       /* At first, gather possible coding-systems in VAL in a reverse
3178          order.  */
3179       val = Qnil;
3180       for (val2 = Vcoding_category_list;
3181            !NILP (val2);
3182            val2 = XCONS (val2)->cdr)
3183         {
3184           int idx
3185             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3186           if (coding_mask & (1 << idx))
3187             {
3188 #if 0
3189               /* This code is suppressed until we find a better way to
3190                  distinguish raw text file and binary file.  */
3191
3192               if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3193                   && eol_type == CODING_EOL_INCONSISTENT)
3194                 val = Fcons (Qno_conversion, val);
3195               else
3196 #endif /* 0 */
3197                 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3198             }
3199         }
3200
3201       /* Then, change the order of the list, while getting subsidiary
3202          coding-systems.  */
3203       val2 = val;
3204       val = Qnil;
3205       if (eol_type == CODING_EOL_INCONSISTENT)
3206         eol_type == CODING_EOL_UNDECIDED;
3207       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3208         {
3209           if (eol_type == CODING_EOL_UNDECIDED)
3210             val = Fcons (XCONS (val2)->car, val);
3211           else
3212             {
3213               Lisp_Object val3;
3214               val3 = Fget (XCONS (val2)->car, Qeol_type);
3215               if (VECTORP (val3))
3216                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3217               else
3218                 val = Fcons (XCONS (val2)->car, val);
3219             }
3220         }
3221     }
3222
3223   return val;
3224 }
3225
3226 /* Scan text in the region between *BEGP and *ENDP, skip characters
3227    which we never have to encode to (iff ENCODEP is 1) or decode from
3228    coding system CODING at the head and tail, then set BEGP and ENDP
3229    to the addresses of start and end of the text we actually convert.  */
3230
3231 void
3232 shrink_conversion_area (begp, endp, coding, encodep)
3233      unsigned char **begp, **endp;
3234      struct coding_system *coding;
3235      int encodep;
3236 {
3237   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3238
3239   if (coding->eol_type != CODING_EOL_LF
3240       && coding->eol_type != CODING_EOL_UNDECIDED)
3241     /* Since we anyway have to convert end-of-line format, it is not
3242        worth skipping at most 100 bytes or so.  */
3243     return;
3244
3245   if (encodep)                  /* for encoding */
3246     {
3247       switch (coding->type)
3248         {
3249         case coding_type_no_conversion:
3250         case coding_type_emacs_mule:
3251         case coding_type_undecided:
3252         case coding_type_raw_text:
3253           /* We need no conversion.  */
3254           *begp = *endp;
3255           return;
3256         case coding_type_ccl:
3257           /* We can't skip any data.  */
3258           return;
3259         case coding_type_iso2022:
3260           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3261             {
3262               unsigned char *bol = beg_addr;
3263               while (beg_addr < end_addr && *beg_addr < 0x80)
3264                 {
3265                   beg_addr++;
3266                   if (*(beg_addr - 1) == '\n')
3267                     bol = beg_addr;
3268                 }
3269               beg_addr = bol;
3270               goto label_skip_tail;
3271             }
3272           /* fall down ... */
3273         default:
3274           /* We can skip all ASCII characters at the head and tail.  */
3275           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3276         label_skip_tail:
3277           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3278           break;
3279         }
3280     }
3281   else                          /* for decoding */
3282     {
3283       switch (coding->type)
3284         {
3285         case coding_type_no_conversion:
3286           /* We need no conversion.  */
3287           *begp = *endp;
3288           return;
3289         case coding_type_emacs_mule:
3290         case coding_type_raw_text:
3291           if (coding->eol_type == CODING_EOL_LF)
3292             {
3293               /* We need no conversion.  */
3294               *begp = *endp;
3295               return;
3296             }
3297           /* We can skip all but carriage-return.  */
3298           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3299           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3300           break;
3301         case coding_type_sjis:
3302         case coding_type_big5:
3303           /* We can skip all ASCII characters at the head.  */
3304           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3305           /* We can skip all ASCII characters at the tail except for
3306              the second byte of SJIS or BIG5 code.  */
3307           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3308           if (end_addr != *endp)
3309             end_addr++;
3310           break;
3311         case coding_type_ccl:
3312           /* We can't skip any data.  */
3313           return;
3314         default:                /* i.e. case coding_type_iso2022: */
3315           {
3316             unsigned char c;
3317
3318             /* We can skip all ASCII characters except for a few
3319                control codes at the head.  */
3320             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3321                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3322                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3323               beg_addr++;
3324           }
3325           break;
3326         }
3327     }
3328   *begp = beg_addr;
3329   *endp = end_addr;
3330   return;
3331 }
3332
3333 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3334    text between B and E.  B and E are buffer position.  */
3335
3336 Lisp_Object
3337 code_convert_region (b, e, coding, encodep)
3338      Lisp_Object b, e;
3339      struct coding_system *coding;
3340      int encodep;
3341 {
3342   int beg, end, len, consumed, produced;
3343   char *buf;
3344   unsigned char *begp, *endp;
3345   int pos = PT;
3346
3347   validate_region (&b, &e);
3348   beg = XINT (b), end = XINT (e);
3349   if (beg < GPT && end >= GPT)
3350     move_gap (end);
3351
3352   if (encodep && !NILP (coding->pre_write_conversion))
3353     {
3354       /* We must call a pre-conversion function which may put a new
3355          text to be converted in a new buffer.  */
3356       struct buffer *old = current_buffer, *new;
3357
3358       TEMP_SET_PT (beg);
3359       call2 (coding->pre_write_conversion, b, e);
3360       if (old != current_buffer)
3361         {
3362           /* Replace the original text by the text just generated.  */
3363           len = ZV - BEGV;
3364           new = current_buffer;
3365           set_buffer_internal (old);
3366           del_range (beg, end);
3367           insert_from_buffer (new, 1, len, 0);
3368           end = beg + len;
3369         }
3370     }
3371
3372   /* We may be able to shrink the conversion region.  */
3373   begp = POS_ADDR (beg); endp = begp + (end - beg);
3374   shrink_conversion_area (&begp, &endp, coding, encodep);
3375
3376   if (begp == endp)
3377     /* We need no conversion.  */
3378     len = end - beg;
3379   else
3380     {
3381       beg += begp - POS_ADDR (beg);
3382       end =  beg + (endp - begp);
3383
3384       if (encodep)
3385         len = encoding_buffer_size (coding, end - beg);
3386       else
3387         len = decoding_buffer_size (coding, end - beg);
3388       buf = get_conversion_buffer (len);
3389
3390       coding->last_block = 1;
3391       produced = (encodep
3392                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3393                                    &consumed)
3394                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3395                                    &consumed));
3396
3397       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3398
3399       TEMP_SET_PT (beg);
3400       insert (buf, produced);
3401       del_range (PT, PT + end - beg);
3402       if (pos >= end)
3403         pos = PT + (pos - end);
3404       else if (pos > beg)
3405         pos = beg;
3406       TEMP_SET_PT (pos);
3407   }
3408
3409   if (!encodep && !NILP (coding->post_read_conversion))
3410     {
3411       /* We must call a post-conversion function which may alter
3412          the text just converted.  */
3413       Lisp_Object insval;
3414
3415       beg = XINT (b);
3416       TEMP_SET_PT (beg);
3417       insval = call1 (coding->post_read_conversion, make_number (len));
3418       CHECK_NUMBER (insval, 0);
3419       len = XINT (insval);
3420     }
3421
3422   return make_number (len);
3423 }
3424
3425 Lisp_Object
3426 code_convert_string (str, coding, encodep, nocopy)
3427      Lisp_Object str, nocopy;
3428      struct coding_system *coding;
3429      int encodep;
3430 {
3431   int len, consumed, produced;
3432   char *buf;
3433   unsigned char *begp, *endp;
3434   int head_skip, tail_skip;
3435   struct gcpro gcpro1;
3436
3437   if (encodep && !NILP (coding->pre_write_conversion)
3438       || !encodep && !NILP (coding->post_read_conversion))
3439     {
3440       /* Since we have to call Lisp functions which assume target text
3441          is in a buffer, after setting a temporary buffer, call
3442          code_convert_region.  */
3443       int count = specpdl_ptr - specpdl;
3444       int len = XSTRING (str)->size;
3445       Lisp_Object result;
3446       struct buffer *old = current_buffer;
3447
3448       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3449       temp_output_buffer_setup (" *code-converting-work*");
3450       set_buffer_internal (XBUFFER (Vstandard_output));
3451       insert_from_string (str, 0, len, 0);
3452       code_convert_region (make_number (BEGV), make_number (ZV),
3453                            coding, encodep);
3454       result = make_buffer_string (BEGV, ZV, 0);
3455       set_buffer_internal (old);
3456       return unbind_to (count, result);
3457     }
3458
3459   /* We may be able to shrink the conversion region.  */
3460   begp = XSTRING (str)->data;
3461   endp = begp + XSTRING (str)->size;
3462   shrink_conversion_area (&begp, &endp, coding, encodep);
3463
3464   if (begp == endp)
3465     /* We need no conversion.  */
3466     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3467
3468   head_skip = begp - XSTRING (str)->data;
3469   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3470
3471   GCPRO1 (str);
3472
3473   if (encodep)
3474     len = encoding_buffer_size (coding, endp - begp);
3475   else
3476     len = decoding_buffer_size (coding, endp - begp);
3477   buf = get_conversion_buffer (len + head_skip + tail_skip);
3478
3479   bcopy (XSTRING (str)->data, buf, head_skip);
3480   coding->last_block = 1;
3481   produced = (encodep
3482               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3483                                buf + head_skip, endp - begp, len, &consumed)
3484               : decode_coding (coding, XSTRING (str)->data + head_skip,
3485                                buf + head_skip, endp - begp, len, &consumed));
3486   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3487          buf + head_skip + produced,
3488          tail_skip);
3489
3490   UNGCPRO;
3491
3492   return make_string (buf, head_skip + produced + tail_skip);
3493 }
3494
3495 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3496        3, 3, "r\nzCoding system: ",
3497   "Decode current region by specified coding system.\n\
3498 When called from a program, takes three arguments:\n\
3499 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3500 Return length of decoded text.")
3501   (b, e, coding_system)
3502      Lisp_Object b, e, coding_system;
3503 {
3504   struct coding_system coding;
3505
3506   CHECK_NUMBER_COERCE_MARKER (b, 0);
3507   CHECK_NUMBER_COERCE_MARKER (e, 1);
3508   CHECK_SYMBOL (coding_system, 2);
3509
3510   if (NILP (coding_system))
3511     return make_number (XFASTINT (e) - XFASTINT (b));
3512   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3513     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3514
3515   return code_convert_region (b, e, &coding, 0);
3516 }
3517
3518 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3519        3, 3, "r\nzCoding system: ",
3520   "Encode current region by specified coding system.\n\
3521 When called from a program, takes three arguments:\n\
3522 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3523 Return length of encoded text.")
3524   (b, e, coding_system)
3525      Lisp_Object b, e, coding_system;
3526 {
3527   struct coding_system coding;
3528
3529   CHECK_NUMBER_COERCE_MARKER (b, 0);
3530   CHECK_NUMBER_COERCE_MARKER (e, 1);
3531   CHECK_SYMBOL (coding_system, 2);
3532
3533   if (NILP (coding_system))
3534     return make_number (XFASTINT (e) - XFASTINT (b));
3535   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3536     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3537
3538   return code_convert_region (b, e, &coding, 1);
3539 }
3540
3541 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3542        2, 3, 0,
3543   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3544 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3545 of decoding.")
3546   (string, coding_system, nocopy)
3547      Lisp_Object string, coding_system, nocopy;
3548 {
3549   struct coding_system coding;
3550
3551   CHECK_STRING (string, 0);
3552   CHECK_SYMBOL (coding_system, 1);
3553
3554   if (NILP (coding_system))
3555     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3556   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3557     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3558
3559   return code_convert_string (string, &coding, 0, nocopy);
3560 }
3561
3562 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3563        2, 3, 0,
3564   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3565 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3566 of encoding.")
3567   (string, coding_system, nocopy)
3568      Lisp_Object string, coding_system, nocopy;
3569 {
3570   struct coding_system coding;
3571
3572   CHECK_STRING (string, 0);
3573   CHECK_SYMBOL (coding_system, 1);
3574
3575   if (NILP (coding_system))
3576     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3577   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3578     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3579
3580   return code_convert_string (string, &coding, 1, nocopy);
3581 }
3582
3583 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3584   "Decode a JISX0208 character of shift-jis encoding.\n\
3585 CODE is the character code in SJIS.\n\
3586 Return the corresponding character.")
3587   (code)
3588      Lisp_Object code;
3589 {
3590   unsigned char c1, c2, s1, s2;
3591   Lisp_Object val;
3592
3593   CHECK_NUMBER (code, 0);
3594   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3595   DECODE_SJIS (s1, s2, c1, c2);
3596   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3597   return val;
3598 }
3599
3600 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3601   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3602 Return the corresponding character code in SJIS.")
3603   (ch)
3604      Lisp_Object ch;
3605 {
3606   int charset, c1, c2, s1, s2;
3607   Lisp_Object val;
3608
3609   CHECK_NUMBER (ch, 0);
3610   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3611   if (charset == charset_jisx0208)
3612     {
3613       ENCODE_SJIS (c1, c2, s1, s2);
3614       XSETFASTINT (val, (s1 << 8) | s2);
3615     }
3616   else
3617     XSETFASTINT (val, 0);
3618   return val;
3619 }
3620
3621 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3622   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3623 CODE is the character code in BIG5.\n\
3624 Return the corresponding character.")
3625   (code)
3626      Lisp_Object code;
3627 {
3628   int charset;
3629   unsigned char b1, b2, c1, c2;
3630   Lisp_Object val;
3631
3632   CHECK_NUMBER (code, 0);
3633   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3634   DECODE_BIG5 (b1, b2, charset, c1, c2);
3635   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3636   return val;
3637 }
3638
3639 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3640   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3641 Return the corresponding character code in Big5.")
3642   (ch)
3643      Lisp_Object ch;
3644 {
3645   int charset, c1, c2, b1, b2;
3646   Lisp_Object val;
3647
3648   CHECK_NUMBER (ch, 0);
3649   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3650   if (charset == charset_big5_1 || charset == charset_big5_2)
3651     {
3652       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3653       XSETFASTINT (val, (b1 << 8) | b2);
3654     }
3655   else
3656     XSETFASTINT (val, 0);
3657   return val;
3658 }
3659
3660 DEFUN ("set-terminal-coding-system-internal",
3661        Fset_terminal_coding_system_internal,
3662        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3663   (coding_system)
3664      Lisp_Object coding_system;
3665 {
3666   CHECK_SYMBOL (coding_system, 0);
3667   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3668   /* We had better not send unexpected characters to terminal.  */
3669   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3670
3671   return Qnil;
3672 }
3673
3674 DEFUN ("set-safe-terminal-coding-system-internal",
3675        Fset_safe_terminal_coding_system_internal,
3676        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3677   (coding_system)
3678      Lisp_Object coding_system;
3679 {
3680   CHECK_SYMBOL (coding_system, 0);
3681   setup_coding_system (Fcheck_coding_system (coding_system),
3682                        &safe_terminal_coding);
3683   return Qnil;
3684 }
3685
3686 DEFUN ("terminal-coding-system",
3687        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3688   "Return coding-system of your terminal.")
3689   ()
3690 {
3691   return terminal_coding.symbol;
3692 }
3693
3694 DEFUN ("set-keyboard-coding-system-internal",
3695        Fset_keyboard_coding_system_internal,
3696        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3697   (coding_system)
3698      Lisp_Object coding_system;
3699 {
3700   CHECK_SYMBOL (coding_system, 0);
3701   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3702   return Qnil;
3703 }
3704
3705 DEFUN ("keyboard-coding-system",
3706        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3707   "Return coding-system of what is sent from terminal keyboard.")
3708   ()
3709 {
3710   return keyboard_coding.symbol;
3711 }
3712
3713 \f
3714 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3715        Sfind_operation_coding_system,  1, MANY, 0,
3716   "Choose a coding system for an operation based on the target name.\n\
3717 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3718 DECODING-SYSTEM is the coding system to use for decoding\n\
3719 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3720 for encoding (in case OPERATION does encoding).\n\
3721 \n\
3722 The first argument OPERATION specifies an I/O primitive:\n\
3723   For file I/O, `insert-file-contents' or `write-region'.\n\
3724   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3725   For network I/O, `open-network-stream'.\n\
3726 \n\
3727 The remaining arguments should be the same arguments that were passed\n\
3728 to the primitive.  Depending on which primitive, one of those arguments\n\
3729 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3730 whichever argument specifies the file name is TARGET.\n\
3731 \n\
3732 TARGET has a meaning which depends on OPERATION:\n\
3733   For file I/O, TARGET is a file name.\n\
3734   For process I/O, TARGET is a process name.\n\
3735   For network I/O, TARGET is a service name or a port number\n\
3736 \n\
3737 This function looks up what specified for TARGET in,\n\
3738 `file-coding-system-alist', `process-coding-system-alist',\n\
3739 or `network-coding-system-alist' depending on OPERATION.\n\
3740 They may specify a coding system, a cons of coding systems,\n\
3741 or a function symbol to call.\n\
3742 In the last case, we call the function with one argument,\n\
3743 which is a list of all the arguments given to this function.")
3744   (nargs, args)
3745      int nargs;
3746      Lisp_Object *args;
3747 {
3748   Lisp_Object operation, target_idx, target, val;
3749   register Lisp_Object chain;
3750
3751   if (nargs < 2)
3752     error ("Too few arguments");
3753   operation = args[0];
3754   if (!SYMBOLP (operation)
3755       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3756     error ("Invalid first arguement");
3757   if (nargs < 1 + XINT (target_idx))
3758     error ("Too few arguments for operation: %s",
3759            XSYMBOL (operation)->name->data);
3760   target = args[XINT (target_idx) + 1];
3761   if (!(STRINGP (target)
3762         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3763     error ("Invalid %dth argument", XINT (target_idx) + 1);
3764
3765   chain = ((EQ (operation, Qinsert_file_contents)
3766             || EQ (operation, Qwrite_region))
3767            ? Vfile_coding_system_alist
3768            : (EQ (operation, Qopen_network_stream)
3769               ? Vnetwork_coding_system_alist
3770               : Vprocess_coding_system_alist));
3771   if (NILP (chain))
3772     return Qnil;
3773
3774   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3775     {
3776       Lisp_Object elt;
3777       elt = XCONS (chain)->car;
3778
3779       if (CONSP (elt)
3780           && ((STRINGP (target)
3781                && STRINGP (XCONS (elt)->car)
3782                && fast_string_match (XCONS (elt)->car, target) >= 0)
3783               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3784         {
3785           val = XCONS (elt)->cdr;
3786           if (CONSP (val))
3787             return val;
3788           if (! SYMBOLP (val))
3789             return Qnil;
3790           if (! NILP (Fcoding_system_p (val)))
3791             return Fcons (val, val);
3792           if (!NILP (Ffboundp (val)))
3793             return call1 (val, Flist (nargs, args));
3794           return Qnil;
3795         }
3796     }
3797   return Qnil;
3798 }
3799
3800 #endif /* emacs */
3801
3802 \f
3803 /*** 8. Post-amble ***/
3804
3805 init_coding_once ()
3806 {
3807   int i;
3808
3809   /* Emacs' internal format specific initialize routine.  */
3810   for (i = 0; i <= 0x20; i++)
3811     emacs_code_class[i] = EMACS_control_code;
3812   emacs_code_class[0x0A] = EMACS_linefeed_code;
3813   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3814   for (i = 0x21 ; i < 0x7F; i++)
3815     emacs_code_class[i] = EMACS_ascii_code;
3816   emacs_code_class[0x7F] = EMACS_control_code;
3817   emacs_code_class[0x80] = EMACS_leading_code_composition;
3818   for (i = 0x81; i < 0xFF; i++)
3819     emacs_code_class[i] = EMACS_invalid_code;
3820   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3821   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3822   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3823   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3824
3825   /* ISO2022 specific initialize routine.  */
3826   for (i = 0; i < 0x20; i++)
3827     iso_code_class[i] = ISO_control_code;
3828   for (i = 0x21; i < 0x7F; i++)
3829     iso_code_class[i] = ISO_graphic_plane_0;
3830   for (i = 0x80; i < 0xA0; i++)
3831     iso_code_class[i] = ISO_control_code;
3832   for (i = 0xA1; i < 0xFF; i++)
3833     iso_code_class[i] = ISO_graphic_plane_1;
3834   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3835   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3836   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3837   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3838   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3839   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3840   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3841   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3842   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3843   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3844
3845   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3846   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3847
3848   setup_coding_system (Qnil, &keyboard_coding);
3849   setup_coding_system (Qnil, &terminal_coding);
3850   setup_coding_system (Qnil, &safe_terminal_coding);
3851
3852 #if defined (MSDOS) || defined (WINDOWSNT)
3853   system_eol_type = CODING_EOL_CRLF;
3854 #else
3855   system_eol_type = CODING_EOL_LF;
3856 #endif
3857 }
3858
3859 #ifdef emacs
3860
3861 syms_of_coding ()
3862 {
3863   Qtarget_idx = intern ("target-idx");
3864   staticpro (&Qtarget_idx);
3865
3866   /* Target FILENAME is the first argument.  */
3867   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3868   /* Target FILENAME is the third argument.  */
3869   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3870
3871   Qcall_process = intern ("call-process");
3872   staticpro (&Qcall_process);
3873   /* Target PROGRAM is the first argument.  */
3874   Fput (Qcall_process, Qtarget_idx, make_number (0));
3875
3876   Qcall_process_region = intern ("call-process-region");
3877   staticpro (&Qcall_process_region);
3878   /* Target PROGRAM is the third argument.  */
3879   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3880
3881   Qstart_process = intern ("start-process");
3882   staticpro (&Qstart_process);
3883   /* Target PROGRAM is the third argument.  */
3884   Fput (Qstart_process, Qtarget_idx, make_number (2));
3885
3886   Qopen_network_stream = intern ("open-network-stream");
3887   staticpro (&Qopen_network_stream);
3888   /* Target SERVICE is the fourth argument.  */
3889   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3890
3891   Qcoding_system = intern ("coding-system");
3892   staticpro (&Qcoding_system);
3893
3894   Qeol_type = intern ("eol-type");
3895   staticpro (&Qeol_type);
3896
3897   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3898   staticpro (&Qbuffer_file_coding_system);
3899
3900   Qpost_read_conversion = intern ("post-read-conversion");
3901   staticpro (&Qpost_read_conversion);
3902
3903   Qpre_write_conversion = intern ("pre-write-conversion");
3904   staticpro (&Qpre_write_conversion);
3905
3906   Qno_conversion = intern ("no-conversion");
3907   staticpro (&Qno_conversion);
3908
3909   Qundecided = intern ("undecided");
3910   staticpro (&Qundecided);
3911
3912   Qcoding_system_spec = intern ("coding-system-spec");
3913   staticpro (&Qcoding_system_spec);
3914
3915   Qcoding_system_p = intern ("coding-system-p");
3916   staticpro (&Qcoding_system_p);
3917
3918   Qcoding_system_error = intern ("coding-system-error");
3919   staticpro (&Qcoding_system_error);
3920
3921   Fput (Qcoding_system_error, Qerror_conditions,
3922         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3923   Fput (Qcoding_system_error, Qerror_message,
3924         build_string ("Invalid coding system"));
3925
3926   Qcoding_category_index = intern ("coding-category-index");
3927   staticpro (&Qcoding_category_index);
3928
3929   {
3930     int i;
3931     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3932       {
3933         coding_category_table[i] = intern (coding_category_name[i]);
3934         staticpro (&coding_category_table[i]);
3935         Fput (coding_category_table[i], Qcoding_category_index,
3936               make_number (i));
3937       }
3938   }
3939
3940   Qcharacter_unification_table = intern ("character-unification-table");
3941   staticpro (&Qcharacter_unification_table);
3942   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3943         make_number (0));
3944
3945   Qcharacter_unification_table_for_decode
3946     = intern ("character-unification-table-for-decode");
3947   staticpro (&Qcharacter_unification_table_for_decode);
3948
3949   Qcharacter_unification_table_for_encode
3950     = intern ("character-unification-table-for-encode");
3951   staticpro (&Qcharacter_unification_table_for_encode);
3952
3953   Qemacs_mule = intern ("emacs-mule");
3954   staticpro (&Qemacs_mule);
3955
3956   defsubr (&Scoding_system_spec);
3957   defsubr (&Scoding_system_p);
3958   defsubr (&Sread_coding_system);
3959   defsubr (&Sread_non_nil_coding_system);
3960   defsubr (&Scheck_coding_system);
3961   defsubr (&Sdetect_coding_region);
3962   defsubr (&Sdecode_coding_region);
3963   defsubr (&Sencode_coding_region);
3964   defsubr (&Sdecode_coding_string);
3965   defsubr (&Sencode_coding_string);
3966   defsubr (&Sdecode_sjis_char);
3967   defsubr (&Sencode_sjis_char);
3968   defsubr (&Sdecode_big5_char);
3969   defsubr (&Sencode_big5_char);
3970   defsubr (&Sset_terminal_coding_system_internal);
3971   defsubr (&Sset_safe_terminal_coding_system_internal);
3972   defsubr (&Sterminal_coding_system);
3973   defsubr (&Sset_keyboard_coding_system_internal);
3974   defsubr (&Skeyboard_coding_system);
3975   defsubr (&Sfind_operation_coding_system);
3976
3977   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3978     "List of coding-categories (symbols) ordered by priority.");
3979   {
3980     int i;
3981
3982     Vcoding_category_list = Qnil;
3983     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3984       Vcoding_category_list
3985         = Fcons (coding_category_table[i], Vcoding_category_list);
3986   }
3987
3988   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3989     "Specify the coding system for read operations.\n\
3990 It is useful to bind this variable with `let', but do not set it globally.\n\
3991 If the value is a coding system, it is used for decoding on read operation.\n\
3992 If not, an appropriate element is used from one of the coding system alists:\n\
3993 There are three such tables, `file-coding-system-alist',\n\
3994 `process-coding-system-alist', and `network-coding-system-alist'.");
3995   Vcoding_system_for_read = Qnil;
3996
3997   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3998     "Specify the coding system for write operations.\n\
3999 It is useful to bind this variable with `let', but do not set it globally.\n\
4000 If the value is a coding system, it is used for encoding on write operation.\n\
4001 If not, an appropriate element is used from one of the coding system alists:\n\
4002 There are three such tables, `file-coding-system-alist',\n\
4003 `process-coding-system-alist', and `network-coding-system-alist'.");
4004   Vcoding_system_for_write = Qnil;
4005
4006   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4007     "Coding system used in the latest file or process I/O.");
4008   Vlast_coding_system_used = Qnil;
4009
4010   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4011     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4012   inhibit_eol_conversion = 0;
4013
4014   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4015     "Alist to decide a coding system to use for a file I/O operation.\n\
4016 The format is ((PATTERN . VAL) ...),\n\
4017 where PATTERN is a regular expression matching a file name,\n\
4018 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4019 If VAL is a coding system, it is used for both decoding and encoding\n\
4020 the file contents.\n\
4021 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4022 and the cdr part is used for encoding.\n\
4023 If VAL is a function symbol, the function must return a coding system\n\
4024 or a cons of coding systems which are used as above.\n\
4025 \n\
4026 See also the function `find-operation-coding-system'.");
4027   Vfile_coding_system_alist = Qnil;
4028
4029   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4030     "Alist to decide a coding system to use for a process I/O operation.\n\
4031 The format is ((PATTERN . VAL) ...),\n\
4032 where PATTERN is a regular expression matching a program name,\n\
4033 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4034 If VAL is a coding system, it is used for both decoding what received\n\
4035 from the program and encoding what sent to the program.\n\
4036 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4037 and the cdr part is used for encoding.\n\
4038 If VAL is a function symbol, the function must return a coding system\n\
4039 or a cons of coding systems which are used as above.\n\
4040 \n\
4041 See also the function `find-operation-coding-system'.");
4042   Vprocess_coding_system_alist = Qnil;
4043
4044   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4045     "Alist to decide a coding system to use for a network I/O operation.\n\
4046 The format is ((PATTERN . VAL) ...),\n\
4047 where PATTERN is a regular expression matching a network service name\n\
4048 or is a port number to connect to,\n\
4049 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4050 If VAL is a coding system, it is used for both decoding what received\n\
4051 from the network stream and encoding what sent to the network stream.\n\
4052 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4053 and the cdr part is used for encoding.\n\
4054 If VAL is a function symbol, the function must return a coding system\n\
4055 or a cons of coding systems which are used as above.\n\
4056 \n\
4057 See also the function `find-operation-coding-system'.");
4058   Vnetwork_coding_system_alist = Qnil;
4059
4060   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4061     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4062   eol_mnemonic_unix = ':';
4063
4064   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4065     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4066   eol_mnemonic_dos = '\\';
4067
4068   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4069     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4070   eol_mnemonic_mac = '/';
4071
4072   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4073     "Mnemonic character indicating end-of-line format is not yet decided.");
4074   eol_mnemonic_undecided = ':';
4075
4076   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4077     "Non-nil means ISO 2022 encoder/decoder do character unification.");
4078   Venable_character_unification = Qt;
4079
4080   DEFVAR_LISP ("standard-character-unification-table-for-decode",
4081     &Vstandard_character_unification_table_for_decode,
4082     "Table for unifying characters when reading.");
4083   Vstandard_character_unification_table_for_decode = Qnil;
4084
4085   DEFVAR_LISP ("standard-character-unification-table-for-encode",
4086     &Vstandard_character_unification_table_for_encode,
4087     "Table for unifying characters when writing.");
4088   Vstandard_character_unification_table_for_encode = Qnil;
4089
4090   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4091     "Alist of charsets vs revision numbers.\n\
4092 While encoding, if a charset (car part of an element) is found,\n\
4093 designate it with the escape sequence identifing revision (cdr part of the element).");
4094   Vcharset_revision_alist = Qnil;
4095
4096   DEFVAR_LISP ("default-process-coding-system",
4097                &Vdefault_process_coding_system,
4098     "Cons of coding systems used for process I/O by default.\n\
4099 The car part is used for decoding a process output,\n\
4100 the cdr part is used for encoding a text to be sent to a process.");
4101   Vdefault_process_coding_system = Qnil;
4102
4103   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4104     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4105 This is a vector of length 256.\n\
4106 If Nth element is non-nil, the existence of code N in a file\n\
4107 (or output of subprocess) doesn't prevent it to be detected as\n\
4108 a coding system of ISO 2022 variant which has a flag\n\
4109 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4110 or reading output of a subprocess.\n\
4111 Only 128th through 159th elements has a meaning.");
4112   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4113 }
4114
4115 #endif /* emacs */