src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
   40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Other
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See section 6 for more details.
  81
  82 */
  83
  84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  85
  86   How end-of-line of a text is encoded depends on a system.  For
  87   instance, Unix's format is just one byte of `line-feed' code,
  88   whereas DOS's format is two-byte sequence of `carriage-return' and
  89   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  90
  91   Since text characters encoding and end-of-line encoding are
  92   independent, any coding system described above can take
  93   any format of end-of-line.  So, Emacs has information of format of
  94   end-of-line in each coding-system.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  99
 100   These functions check if a text between SRC and SRC_END is encoded
 101   in the coding system category XXX.  Each returns an integer value in
  102   which appropriate flag bits for the category XXX are set.  The flag
 103   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 104   template of these functions.  */
 105 #if 0
 106 int
 107 detect_coding_emacs_mule (src, src_end)
 108      unsigned char *src, *src_end;
 109 {
 110   ...
 111 }
 112 #endif
 113
 114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 115
 116   These functions decode SRC_BYTES length text at SOURCE encoded in
 117   CODING to Emacs' internal format (emacs-mule).  The resulting text
 118   goes to a place pointed to by DESTINATION, the length of which should
 119   not exceed DST_BYTES.  The number of bytes actually processed is
 120   returned as *CONSUMED.  The return value is the length of the decoded
 121   text.  Below is a template of these functions.  */
 122 #if 0
 123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 124      struct coding_system *coding;
 125      unsigned char *source, *destination;
 126      int src_bytes, dst_bytes;
 127      int *consumed;
 128 {
 129   ...
 130 }
 131 #endif
 132
 133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 134
 135   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 136   internal format (emacs-mule) to CODING.  The resulting text goes to
 137   a place pointed to by DESTINATION, the length of which should not
 138   exceed DST_BYTES.  The number of bytes actually processed is
 139   returned as *CONSUMED.  The return value is the length of the
 140   encoded text.  Below is a template of these functions.  */
 141 #if 0
 142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 143      struct coding_system *coding;
 144      unsigned char *source, *destination;
 145      int src_bytes, dst_bytes;
 146      int *consumed;
 147 {
 148   ...
 149 }
 150 #endif
 151
 152 /*** COMMONLY USED MACROS ***/
 153
  154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
  155    THREE_MORE_BYTES fetch one, two, or three bytes from the source
  156    text into their arguments, advancing `src' past them.  If fewer
  157    bytes than requested remain before `src_end', nothing is consumed
  158    and control jumps to `label_end_of_loop', which the expanding
  159    function must define along with `src' and `src_end'.  */
  160 #define ONE_MORE_BYTE(c1)       \
  161   do {                          \
  162     if (src < src_end)          \
  163       c1 = *src++;              \
  164     else                        \
  165       goto label_end_of_loop;   \
  166   } while (0)
  167
  168 #define TWO_MORE_BYTES(c1, c2)  \
  169   do {                          \
  170     if (src + 1 < src_end)      \
  171       c1 = *src++, c2 = *src++; \
  172     else                        \
  173       goto label_end_of_loop;   \
  174   } while (0)
  175
  176 #define THREE_MORE_BYTES(c1, c2, c3)            \
  177   do {                                          \
  178     if (src + 2 < src_end)                      \
  179       c1 = *src++, c2 = *src++, c3 = *src++;    \
  180     else                                        \
  181       goto label_end_of_loop;                   \
  182   } while (0)
 183
 184 /* The following three macros DECODE_CHARACTER_ASCII,
 185    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 186    the multi-byte form of a character of each class at the place
 187    pointed by `dst'.  The caller should set the variable `dst' to
 188    point to an appropriate area and the variable `coding' to point to
 189    the coding-system of the currently decoding text in advance.  */
 190
  191 /* Decode one ASCII character C.  While composing, it is stored as the
  192    two bytes 0xA0 and C | 0x80; otherwise as the single byte C.  */
  193 #define DECODE_CHARACTER_ASCII(c)                               \
  194   do {                                                          \
  195     if (COMPOSING_P (coding->composing))                        \
  196       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
  197     else                                                        \
  198       *dst++ = (c);                                             \
  199   } while (0)
 200
 201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 202    position-code is C.  */
 203
 204 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 205   do {                                                                  \
 206     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 207     if (COMPOSING_P (coding->composing))                                \
 208       *dst++ = leading_code + 0x20;                                     \
 209     else                                                                \
 210       *dst++ = leading_code;                                            \
 211     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 212       *dst++ = leading_code;                                            \
 213     *dst++ = (c) | 0x80;                                                \
 214   } while (0)
 215
  216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
  217    position-codes are C1 and C2: the same bytes as for a DIMENSION1
  218    character with position-code C1, followed by C2 | 0x80.  */
  219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  220   do {                                                  \
  221     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
  222     *dst++ = (c2) | 0x80;                               \
  223   } while (0)
 224
 225 \f
 226 /*** 1. Preamble ***/
 227
 228 #include <stdio.h>
 229
 230 #ifdef emacs
 231
 232 #include <config.h>
 233 #include "lisp.h"
 234 #include "buffer.h"
 235 #include "charset.h"
 236 #include "ccl.h"
 237 #include "coding.h"
 238 #include "window.h"
 239
 240 #else  /* not emacs */
 241
 242 #include "mulelib.h"
 243
 244 #endif /* not emacs */
 245
  246 Lisp_Object Qcoding_system, Qeol_type;
  247 Lisp_Object Qbuffer_file_coding_system;
  248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  249 /* NOTE(review): Q-prefixed objects look like Lisp symbols; set up elsewhere.  */
  250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
  252 Lisp_Object Qstart_process, Qopen_network_stream;
  253 Lisp_Object Qtarget_idx;
  254
  255 /* Mnemonic character of each format of end-of-line.  */
  256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  257 /* Mnemonic character to indicate that the format of end-of-line is
  258    not yet decided.  */
  259 int eol_mnemonic_undecided;
  260
  261 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
  262    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
  263 int system_eol_type;
 264
  265 #ifdef emacs
  266
  267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
  268
  269 /* Coding system emacs-mule is for converting only end-of-line format.  */
  270 Lisp_Object Qemacs_mule;
  271
  272 /* Coding-systems are handed between Emacs Lisp programs and C internal
  273    routines by the following three variables.  */
  274 /* Coding-system for reading files and receiving data from a process.  */
  275 Lisp_Object Vcoding_system_for_read;
  276 /* Coding-system for writing files and sending data to a process.  */
  277 Lisp_Object Vcoding_system_for_write;
  278 /* Coding-system actually used in the latest I/O.  */
  279 Lisp_Object Vlast_coding_system_used;
  280
  281 /* Flag to inhibit code conversion of end-of-line format.  */
  282 int inhibit_eol_conversion;
  283
  284 /* Coding-system that the terminal accepts for display.  */
  285 struct coding_system terminal_coding;
  286
  287 /* Coding-system of what is sent from the terminal keyboard.  */
  288 struct coding_system keyboard_coding;
  289 /* NOTE(review): presumably alists selecting coding systems per target -- confirm.  */
  290 Lisp_Object Vfile_coding_system_alist;
  291 Lisp_Object Vprocess_coding_system_alist;
  292 Lisp_Object Vnetwork_coding_system_alist;
  293
  294 #endif /* emacs */
 295
  296 Lisp_Object Qcoding_category_index;
  297
  298 /* List of symbols `coding-category-xxx' ordered by priority.  */
  299 Lisp_Object Vcoding_category_list;
  300
  301 /* Table of coding-systems currently assigned to each coding-category.  */
  302 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
  303
  304 /* Table of the symbol name of each coding-category (nine names here).  */
  305 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  306   "coding-category-emacs-mule",
  307   "coding-category-sjis",
  308   "coding-category-iso-7",
  309   "coding-category-iso-8-1",
  310   "coding-category-iso-8-2",
  311   "coding-category-iso-7-else",
  312   "coding-category-iso-8-else",
  313   "coding-category-big5",
  314   "coding-category-binary"
  315 };
  316
  317 /* Flag to tell if we look up the unification table on character code
  318    conversion.  */
  319 Lisp_Object Venable_character_unification;
  320 /* Standard unification table to look up on decoding (reading).  */
  321 Lisp_Object Vstandard_character_unification_table_for_decode;
  322 /* Standard unification table to look up on encoding (writing).  */
  323 Lisp_Object Vstandard_character_unification_table_for_encode;
  324
  325 Lisp_Object Qcharacter_unification_table;
  326 Lisp_Object Qcharacter_unification_table_for_decode;
  327 Lisp_Object Qcharacter_unification_table_for_encode;
  328
  329 /* Alist of charsets vs. revision numbers.  */
  330 Lisp_Object Vcharset_revision_alist;
  331
  332 /* Default coding systems used for process I/O.  */
  333 Lisp_Object Vdefault_process_coding_system;
 334
 335 \f
 336 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 337
 338 /* Emacs' internal format for encoding multiple character sets is a
 339    kind of multi-byte encoding, i.e. characters are encoded by
 340    variable-length sequences of one-byte codes.  ASCII characters
 341    and control characters (e.g. `tab', `newline') are represented by
 342    one-byte sequences which are their ASCII codes, in the range 0x00
 343    through 0x7F.  The other characters are represented by a sequence
 344    of `base leading-code', optional `extended leading-code', and one
 345    or two `position-code's.  The length of the sequence is determined
 346    by the base leading-code.  Leading-code takes the range 0x80
 347    through 0x9F, whereas extended leading-code and position-code take
 348    the range 0xA0 through 0xFF.  See `charset.h' for more details
 349    about leading-code and position-code.
 350
 351    There's one exception to this rule.  Special leading-code
 352    `leading-code-composition' denotes that the following several
 353    characters should be composed into one character.  Leading-codes of
 354    components (except for ASCII) are added 0x20.  An ASCII character
 355    component is represented by a 2-byte sequence of `0xA0' and
 356    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 357    details of composite character.  Hence, we can summarize the code
 358    range as follows:
 359
 360    --- CODE RANGE of Emacs' internal format ---
 361    (character set)      (range)
 362    ASCII                0x00 .. 0x7F
 363    ELSE (1st byte)      0x80 .. 0x9F
 364         (rest bytes)    0xA0 .. 0xFF
 365    ---------------------------------------------
 366
 367   */
  368 /* Code class of every byte value; detect_coding_emacs_mule indexes it.  */
  369 enum emacs_code_class_type emacs_code_class[256];
 370
  371 /* Go on (consuming *SRC) only if *SRC is accessible and >= 0xA0.  At
  372    SRC_END jump to `label_end_of_switch'; on a smaller code return 0.  */
  373 #define CHECK_CODE_RANGE_A0_FF  \
  374   do {                          \
  375     if (src >= src_end)         \
  376       goto label_end_of_switch; \
  377     else if (*src++ < 0xA0)     \
  378       return 0;                 \
  379   } while (0)
 380
  381 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  382    Check if a text is encoded in Emacs' internal format.  If it is,
  383    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  A sequence
  384    cut short by SRC_END is not counted as a mismatch.  */
  385 int
  386 detect_coding_emacs_mule (src, src_end)
  387      unsigned char *src, *src_end;
  388 {
  389   unsigned char c;
  390   int composing = 0;            /* Nonzero inside a composite character.  */
  391
  392   while (src < src_end)
  393     {
  394       c = *src++;
  395
  396       if (composing)
  397         {
  398           if (c < 0xA0)
  399             composing = 0;      /* Not a component: composition is over.  */
  400           else
  401             c -= 0x20;          /* Undo the 0x20 added to a component's code.  */
  402         }
  403
  404       switch (emacs_code_class[c])
  405         {
  406         case EMACS_ascii_code:
  407         case EMACS_linefeed_code:
  408           break;
  409
  410         case EMACS_control_code:
  411           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  412             return 0;           /* ESC, SI, and SO rule this format out.  */
  413           break;
  414
  415         case EMACS_invalid_code:
  416           return 0;
  417
  418         case EMACS_leading_code_composition: /* c == 0x80 */
  419           if (composing)        /* Was 0xA0: ASCII component, a byte follows.  */
  420             CHECK_CODE_RANGE_A0_FF;
  421           else
  422             composing = 1;
  423           break;
  424
  425         case EMACS_leading_code_4:
  426           CHECK_CODE_RANGE_A0_FF;
  427           /* fall down to check it two more times ...  */
  428
  429         case EMACS_leading_code_3:
  430           CHECK_CODE_RANGE_A0_FF;
  431           /* fall down to check it one more time ...  */
  432
  433         case EMACS_leading_code_2:
  434           CHECK_CODE_RANGE_A0_FF;
  435           break;
  436
  437         default:
  438         label_end_of_switch:    /* Target of CHECK_CODE_RANGE_A0_FF at SRC_END.  */
  439           break;
  440         }
  441     }
  442   return CODING_CATEGORY_MASK_EMACS_MULE;
  443 }
 444
 445 \f
 446 /*** 3. ISO2022 handlers ***/
 447
 448 /* The following note describes the coding system ISO2022 briefly.
 449    Since the intention of this note is to help in understanding of
 450    the programs in this file, some parts are NOT ACCURATE or OVERLY
 451    SIMPLIFIED.  For the thorough understanding, please refer to the
 452    original document of ISO2022.
 453
 454    ISO2022 provides many mechanisms to encode several character sets
  455    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 456    all text is encoded by codes of less than 128.  This may make the
 457    encoded text a little bit longer, but the text gets more stability
 458    to pass through several gateways (some of them strip off the MSB).
 459
 460    There are two kinds of character set: control character set and
 461    graphic character set.  The former contains control characters such
 462    as `newline' and `escape' to provide control functions (control
 463    functions are provided also by escape sequences).  The latter
  464    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 465    two control character sets and many graphic character sets.
 466
 467    Graphic character sets are classified into one of the following
 468    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 469    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 470    bytes (DIMENSION) and the number of characters in one dimension
 471    (CHARS) of the set.  In addition, each character set is assigned an
 472    identification tag (called "final character" and denoted as <F>
 473    here after) which is unique in each class.  <F> of each character
 474    set is decided by ECMA(*) when it is registered in ISO.  Code range
 475    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 476
 477    Note (*): ECMA = European Computer Manufacturers Association
 478
 479    Here are examples of graphic character set [NAME(<F>)]:
 480         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 481         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 482         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 483         o DIMENSION2_CHARS96 -- none for the moment
 484
 485    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 486         C0 [0x00..0x1F] -- control character plane 0
 487         GL [0x20..0x7F] -- graphic character plane 0
 488         C1 [0x80..0x9F] -- control character plane 1
 489         GR [0xA0..0xFF] -- graphic character plane 1
 490
 491    A control character set is directly designated and invoked to C0 or
 492    C1 by an escape sequence.  The most common case is that ISO646's
 493    control character set is designated/invoked to C0 and ISO6429's
 494    control character set is designated/invoked to C1, and usually
 495    these designations/invocations are omitted in a coded text.  With
 496    7-bit environment, only C0 can be used, and a control character for
 497    C1 is encoded by an appropriate escape sequence to fit in the
  498    environment.  Every control character for C1 has a corresponding
  499    escape sequence defined for this purpose.
 500
 501    A graphic character set is at first designated to one of four
 502    graphic registers (G0 through G3), then these graphic registers are
 503    invoked to GL or GR.  These designations and invocations can be
 504    done independently.  The most common case is that G0 is invoked to
 505    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 506    these invocations and designations are omitted in a coded text.
 507    With 7-bit environment, only GL can be used.
 508
 509    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 510    and 0x7F of GL area work as control characters SPACE and DEL
 511    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 512
 513    There are two ways of invocation: locking-shift and single-shift.
 514    With locking-shift, the invocation lasts until the next different
 515    invocation, whereas with single-shift, the invocation works only
 516    for the following character and doesn't affect locking-shift.
 517    Invocations are done by the following control characters or escape
 518    sequences.
 519
 520    ----------------------------------------------------------------------
 521    function             control char    escape sequence description
 522    ----------------------------------------------------------------------
 523    SI  (shift-in)               0x0F    none            invoke G0 to GL
 524    SO  (shift-out)              0x0E    none            invoke G1 to GL
 525    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 526    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 527    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 528    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 529    ----------------------------------------------------------------------
 530    The first four are for locking-shift.  Control characters for these
 531    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 532
 533    Designations are done by the following escape sequences.
 534    ----------------------------------------------------------------------
 535    escape sequence      description
 536    ----------------------------------------------------------------------
 537    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 538    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 539    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 540    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 541    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 542    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 543    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 544    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 545    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 546    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 547    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 548    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 549    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 550    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 551    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 552    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 553    ----------------------------------------------------------------------
 554
 555    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 556    of dimension 1, chars 94, and final character <F>, and etc.
 557
 558    Note (*): Although these designations are not allowed in ISO2022,
 559    Emacs accepts them on decoding, and produces them on encoding
 560    CHARS96 character set in a coding system which is characterized as
 561    7-bit environment, non-locking-shift, and non-single-shift.
 562
 563    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 564    '(' can be omitted.  We call this as "short-form" here after.
 565
 566    Now you may notice that there are a lot of ways for encoding the
  567    same multilingual text in ISO2022.  Actually, there exist many
  568    coding systems such as Compound Text (used in X's inter-client
  569    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 570    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 571    localized platforms), and all of these are variants of ISO2022.
 572
 573    In addition to the above, Emacs handles two more kinds of escape
 574    sequences: ISO6429's direction specification and Emacs' private
 575    sequence for specifying character composition.
 576
 577    ISO6429's direction specification takes the following format:
 578         o CSI ']'      -- end of the current direction
 579         o CSI '0' ']'  -- end of the current direction
 580         o CSI '1' ']'  -- start of left-to-right text
 581         o CSI '2' ']'  -- start of right-to-left text
 582    The control character CSI (0x9B: control sequence introducer) is
 583    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 584
 585    Character composition specification takes the following format:
 586         o ESC '0' -- start character composition
 587         o ESC '1' -- end character composition
  588    Since these are not standard escape sequences of any ISO standard,
  589    their use with these meanings is restricted to Emacs only.  */
  590 /* One entry per byte value (256); presumably filled in elsewhere.  */
  591 enum iso_code_class_type iso_code_class[256];
 592
 593 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 594    Check if a text is encoded in ISO2022.  If it is, returns an
 595    integer in which appropriate flag bits any of:
 596         CODING_CATEGORY_MASK_ISO_7
 597         CODING_CATEGORY_MASK_ISO_8_1
 598         CODING_CATEGORY_MASK_ISO_8_2
 599         CODING_CATEGORY_MASK_ISO_7_ELSE
 600         CODING_CATEGORY_MASK_ISO_8_ELSE
 601    are set.  If a code which should never appear in ISO2022 is found,
 602    returns 0.  */
 603
 604 int
 605 detect_coding_iso2022 (src, src_end)
 606      unsigned char *src, *src_end;
 607 {
 608   int mask = (CODING_CATEGORY_MASK_ISO_7
 609               | CODING_CATEGORY_MASK_ISO_8_1
 610               | CODING_CATEGORY_MASK_ISO_8_2
 611               | CODING_CATEGORY_MASK_ISO_7_ELSE
 612               | CODING_CATEGORY_MASK_ISO_8_ELSE
 613               );
 614   int g1 = 0;                   /* 1 iff designating to G1.  */
 615   int c, i;
 616
 617   while (src < src_end)
 618     {
 619       c = *src++;
 620       switch (c)
 621         {
 622         case ISO_CODE_ESC:
 623           if (src >= src_end)
 624             break;
 625           c = *src++;
 626           if ((c >= '(' && c <= '/'))
 627             {
 628               /* Designation sequence for a charset of dimension 1.  */
 629               if (src >= src_end)
 630                 break;
 631               c = *src++;
 632               if (c < ' ' || c >= 0x80)
 633                 /* Invalid designation sequence.  */
 634                 return 0;
 635             }
 636           else if (c == '$')
 637             {
 638               /* Designation sequence for a charset of dimension 2.  */
 639               if (src >= src_end)
 640                 break;
 641               c = *src++;
 642               if (c >= '@' && c <= 'B')
 643                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 644                 ;
 645               else if (c >= '(' && c <= '/')
 646                 {
 647                   if (src >= src_end)
 648                     break;
 649                   c = *src++;
 650                   if (c < ' ' || c >= 0x80)
 651                     /* Invalid designation sequence.  */
 652                     return 0;
 653                 }
 654               else
 655                 /* Invalid designation sequence.  */
 656                 return 0;
 657             }
 658           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 659             /* Locking shift.  */
 660             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 661                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 662           else if (c == '0' || c == '1' || c == '2')
 663             /* Start/end composition.  */
 664             ;
 665           else
 666             /* Invalid escape sequence.  */
 667             return 0;
 668           break;
 669
 670         case ISO_CODE_SO:
 671           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 672                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 673           break;
 674
 675         case ISO_CODE_CSI:
 676         case ISO_CODE_SS2:
 677         case ISO_CODE_SS3:
 678           return CODING_CATEGORY_MASK_ISO_8_ELSE;
 679
 680         default:
 681           if (c < 0x80)
 682             break;
 683           else if (c < 0xA0)
 684             return 0;
 685           else
 686             {
 687               unsigned char *src_begin = src;
 688
 689               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 690                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 691               while (src < src_end && *src >= 0xA0)
 692                 src++;
 693               if ((src - src_begin - 1) & 1 && src < src_end)
 694                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 695             }
 696           break;
 697         }
 698     }
 699
 700   return mask;
 701 }
 702
  703 /* Decode a character of which charset is CHARSET and the 1st position
  704    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
  705    fetched from SRC and set to C2.  If CHARSET is negative, it means
  706    that we are decoding ill formed text, and what we can do is just to
  707    read C1 as is.  The expanding function supplies `coding', `dst',
  708    `c2', `unification_table', and whatever ONE_MORE_BYTE needs.  */
  709 #define DECODE_ISO_CHARACTER(charset, c1)                               \
  710   do {                                                                  \
  711     int c_alt, charset_alt = (charset);                                 \
  712     if (COMPOSING_HEAD_P (coding->composing))                           \
  713       {                                                                 \
  714         *dst++ = LEADING_CODE_COMPOSITION;                              \
  715         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
  716           /* To tell composition rules are embedded.  */                \
  717           *dst++ = 0xFF;                                                \
  718         coding->composing += 2;                                         \
  719       }                                                                 \
  720     if ((charset) >= 0)                                                 \
  721       {                                                                 \
  722         if (CHARSET_DIMENSION (charset) == 2)                           \
  723           ONE_MORE_BYTE (c2);                                           \
  724         if (!NILP (unification_table)                                   \
  725             && ((c_alt = unify_char (unification_table,                 \
  726                                      -1, (charset), c1, c2)) >= 0))     \
  727           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
  728       }                                                                 \
  729     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
  730       DECODE_CHARACTER_ASCII (c1);                                      \
  731     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
  732       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
  733     else                                                                \
  734       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
  735     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
  736       /* To tell a composition rule follows.  */                        \
  737       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  738   } while (0)
 739
 740 /* Set designation state into CODING.  */
 741 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 742   do {                                                                  \
 743     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 744                                      make_number (chars),               \
 745                                      make_number (final_char));         \
 746     if (charset >= 0)                                                   \
 747       {                                                                 \
 748         if (coding->direction == 1                                      \
 749             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 750           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 751         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 752       }                                                                 \
 753   } while (0)
 754
 755 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 756
 757 int
 758 decode_coding_iso2022 (coding, source, destination,
 759                        src_bytes, dst_bytes, consumed)
 760      struct coding_system *coding;
 761      unsigned char *source, *destination;
 762      int src_bytes, dst_bytes;
 763      int *consumed;
 764 {
 765   unsigned char *src = source;
 766   unsigned char *src_end = source + src_bytes;
 767   unsigned char *dst = destination;
 768   unsigned char *dst_end = destination + dst_bytes;
 769   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 770      from DST_END to assure that overflow checking is necessary only
 771      at the head of loop.  */
 772   unsigned char *adjusted_dst_end = dst_end - 6;
 773   int charset;
 774   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 775   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 776   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 777   Lisp_Object unification_table
 778       = coding->character_unification_table_for_decode;
 779
 780   if (!NILP (Venable_character_unification) && NILP (unification_table))
 781     unification_table = Vstandard_character_unification_table_for_decode;
 782
 783   while (src < src_end && dst < adjusted_dst_end)
 784     {
 785       /* SRC_BASE remembers the start position in source in each loop.
 786          The loop will be exited when there's not enough source text
 787          to analyze long escape sequence or 2-byte code (within macros
 788          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 789          to SRC_BASE before exiting.  */
 790       unsigned char *src_base = src;
 791       int c1 = *src++, c2;
 792
 793       switch (iso_code_class [c1])
 794         {
 795         case ISO_0x20_or_0x7F:
 796           if (!coding->composing
 797               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 798             {
 799               /* This is SPACE or DEL.  */
 800               *dst++ = c1;
 801               break;
 802             }
 803           /* This is a graphic character, we fall down ...  */
 804
 805         case ISO_graphic_plane_0:
 806           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 807             {
 808               /* This is a composition rule.  */
 809               *dst++ = c1 | 0x80;
 810               coding->composing = COMPOSING_WITH_RULE_TAIL;
 811             }
 812           else
 813             DECODE_ISO_CHARACTER (charset0, c1);
 814           break;
 815
 816         case ISO_0xA0_or_0xFF:
 817           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 818             {
 819               /* Invalid code.  */
 820               *dst++ = c1;
 821               break;
 822             }
 823           /* This is a graphic character, we fall down ... */
 824
 825         case ISO_graphic_plane_1:
 826           DECODE_ISO_CHARACTER (charset1, c1);
 827           break;
 828
 829         case ISO_control_code:
 830           /* All ISO2022 control characters in this class have the
 831              same representation in Emacs internal format.  */
 832           *dst++ = c1;
 833           break;
 834
 835         case ISO_carriage_return:
 836           if (coding->eol_type == CODING_EOL_CR)
 837             {
 838               *dst++ = '\n';
 839             }
 840           else if (coding->eol_type == CODING_EOL_CRLF)
 841             {
 842               ONE_MORE_BYTE (c1);
 843               if (c1 == ISO_CODE_LF)
 844                 *dst++ = '\n';
 845               else
 846                 {
 847                   src--;
 848                   *dst++ = c1;
 849                 }
 850             }
 851           else
 852             {
 853               *dst++ = c1;
 854             }
 855           break;
 856
 857         case ISO_shift_out:
 858           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 859             goto label_invalid_escape_sequence;
 860           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 861           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 862           break;
 863
 864         case ISO_shift_in:
 865           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 866           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 867           break;
 868
 869         case ISO_single_shift_2_7:
 870         case ISO_single_shift_2:
 871           /* SS2 is handled as an escape sequence of ESC 'N' */
 872           c1 = 'N';
 873           goto label_escape_sequence;
 874
 875         case ISO_single_shift_3:
 876           /* SS2 is handled as an escape sequence of ESC 'O' */
 877           c1 = 'O';
 878           goto label_escape_sequence;
 879
 880         case ISO_control_sequence_introducer:
 881           /* CSI is handled as an escape sequence of ESC '[' ...  */
 882           c1 = '[';
 883           goto label_escape_sequence;
 884
 885         case ISO_escape:
 886           ONE_MORE_BYTE (c1);
 887         label_escape_sequence:
 888           /* Escape sequences handled by Emacs are invocation,
 889              designation, direction specification, and character
 890              composition specification.  */
 891           switch (c1)
 892             {
 893             case '&':           /* revision of following character set */
 894               ONE_MORE_BYTE (c1);
 895               if (!(c1 >= '@' && c1 <= '~'))
 896                 goto label_invalid_escape_sequence;
 897               ONE_MORE_BYTE (c1);
 898               if (c1 != ISO_CODE_ESC)
 899                 goto label_invalid_escape_sequence;
 900               ONE_MORE_BYTE (c1);
 901               goto label_escape_sequence;
 902
 903             case '$':           /* designation of 2-byte character set */
 904               ONE_MORE_BYTE (c1);
 905               if (c1 >= '@' && c1 <= 'B')
 906                 {       /* designation of JISX0208.1978, GB2312.1980,
 907                                    or JISX0208.1980 */
 908                   DECODE_DESIGNATION (0, 2, 94, c1);
 909                 }
 910               else if (c1 >= 0x28 && c1 <= 0x2B)
 911                 {       /* designation of DIMENSION2_CHARS94 character set */
 912                   ONE_MORE_BYTE (c2);
 913                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 914                 }
 915               else if (c1 >= 0x2C && c1 <= 0x2F)
 916                 {       /* designation of DIMENSION2_CHARS96 character set */
 917                   ONE_MORE_BYTE (c2);
 918                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 919                 }
 920               else
 921                 goto label_invalid_escape_sequence;
 922               break;
 923
 924             case 'n':           /* invocation of locking-shift-2 */
 925               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 926                 goto label_invalid_escape_sequence;
 927               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 928               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 929               break;
 930
 931             case 'o':           /* invocation of locking-shift-3 */
 932               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 933                 goto label_invalid_escape_sequence;
 934               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 935               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 936               break;
 937
 938             case 'N':           /* invocation of single-shift-2 */
 939               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 940                 goto label_invalid_escape_sequence;
 941               ONE_MORE_BYTE (c1);
 942               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 943               DECODE_ISO_CHARACTER (charset, c1);
 944               break;
 945
 946             case 'O':           /* invocation of single-shift-3 */
 947               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 948                 goto label_invalid_escape_sequence;
 949               ONE_MORE_BYTE (c1);
 950               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 951               DECODE_ISO_CHARACTER (charset, c1);
 952               break;
 953
 954             case '0':           /* start composing without embeded rules */
 955               coding->composing = COMPOSING_NO_RULE_HEAD;
 956               break;
 957
 958             case '1':           /* end composing */
 959               coding->composing = COMPOSING_NO;
 960               break;
 961
 962             case '2':           /* start composing with embeded rules */
 963               coding->composing = COMPOSING_WITH_RULE_HEAD;
 964               break;
 965
 966             case '[':           /* specification of direction */
 967               /* For the moment, nested direction is not supported.
 968                  So, the value of `coding->direction' is 0 or 1: 0
 969                  means left-to-right, 1 means right-to-left.  */
 970               ONE_MORE_BYTE (c1);
 971               switch (c1)
 972                 {
 973                 case ']':       /* end of the current direction */
 974                   coding->direction = 0;
 975
 976                 case '0':       /* end of the current direction */
 977                 case '1':       /* start of left-to-right direction */
 978                   ONE_MORE_BYTE (c1);
 979                   if (c1 == ']')
 980                     coding->direction = 0;
 981                   else
 982                     goto label_invalid_escape_sequence;
 983                   break;
 984
 985                 case '2':       /* start of right-to-left direction */
 986                   ONE_MORE_BYTE (c1);
 987                   if (c1 == ']')
 988                     coding->direction= 1;
 989                   else
 990                     goto label_invalid_escape_sequence;
 991                   break;
 992
 993                 default:
 994                   goto label_invalid_escape_sequence;
 995                 }
 996               break;
 997
 998             default:
 999               if (c1 >= 0x28 && c1 <= 0x2B)
1000                 {       /* designation of DIMENSION1_CHARS94 character set */
1001                   ONE_MORE_BYTE (c2);
1002                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1003                 }
1004               else if (c1 >= 0x2C && c1 <= 0x2F)
1005                 {       /* designation of DIMENSION1_CHARS96 character set */
1006                   ONE_MORE_BYTE (c2);
1007                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1008                 }
1009               else
1010                 {
1011                   goto label_invalid_escape_sequence;
1012                 }
1013             }
1014           /* We must update these variables now.  */
1015           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1016           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1017           break;
1018
1019         label_invalid_escape_sequence:
1020           {
1021             int length = src - src_base;
1022
1023             bcopy (src_base, dst, length);
1024             dst += length;
1025           }
1026         }
1027       continue;
1028
1029     label_end_of_loop:
1030       coding->carryover_size = src - src_base;
1031       bcopy (src_base, coding->carryover, coding->carryover_size);
1032       src = src_base;
1033       break;
1034     }
1035
1036   /* If this is the last block of the text to be decoded, we had
1037      better just flush out all remaining codes in the text although
1038      they are not valid characters.  */
1039   if (coding->last_block)
1040     {
1041       bcopy (src, dst, src_end - src);
1042       dst += (src_end - src);
1043       src = src_end;
1044     }
1045   *consumed = src - source;
1046   return dst - destination;
1047 }
1048
1049 /* ISO2022 encoding stuff.  */
1050
1051 /*
1052    It is not enough to say just "ISO2022" on encoding, we have to
1053    specify more details.  In Emacs, each coding-system of ISO2022
1054    variant has the following specifications:
1055         1. Initial designation to G0 thru G3.
1056         2. Allows short-form designation?
1057         3. ASCII should be designated to G0 before control characters?
1058         4. ASCII should be designated to G0 at end of line?
1059         5. 7-bit environment or 8-bit environment?
1060         6. Use locking-shift?
1061         7. Use Single-shift?
1062    And the following two are only for Japanese:
1063         8. Use ASCII in place of JIS0201-1976-Roman?
1064         9. Use JISX0208-1983 in place of JISX0208-1978?
1065    These specifications are encoded in `coding->flags' as flag bits
1066    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1067    details.
1068 */
1069
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG, and record the designation in CODING.

   The sequence is ESC, then `$' for a charset whose dimension is not
   1, then an intermediate character selected by REG (one of "()*+"
   for a 94-character set, one of ",-./" otherwise), then the final
   character of CHARSET.  If CHARSET has an entry in
   Vcharset_revision_alist, ESC `&' and a revision character are
   produced in front of it.  If <final-char> of CHARSET is '@', 'A',
   or 'B', REG is 0, CHARSET is a 94-character set of dimension other
   than 1, and the coding system CODING allows
   (CODING_FLAG_ISO_SHORT_FORM), the short form without the
   intermediate character is produced.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    int has_94_chars = (CHARSET_CHARS (charset) == 94);                 \
    char *intermediates = has_94_chars ? "()*+" : ",-./";               \
    int short_form = 0;                                                 \
    Lisp_Object revision                                                \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
                                                                        \
    if (! NILP (revision))                                              \
      {                                                                 \
        /* Announce the revision of the charset first.  */              \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision)->cdr) + '@';                    \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        *dst++ = '$';                                                   \
        /* The short form omits the intermediate character.  */         \
        short_form = (has_94_chars                                      \
                      && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)   \
                      && (reg) == 0                                     \
                      && final_char >= '@' && final_char <= 'B');       \
      }                                                                 \
    if (!short_form)                                                    \
      *dst++ = (unsigned char) (intermediates[reg]);                    \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1112
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  With CODING_FLAG_ISO_SEVEN_BITS set in
   CODING->flags the escape sequence (ESC 'N' or ESC 'O') is produced,
   otherwise the single control code.  Both then set the
   single-shifting flag of CODING to 1.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_SS2;                          \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_SS3;                          \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1134
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register (0, 1, 2, or 3
   respectively) is now invoked to graphic plane 0.  */

/* Shift-in: the control code SI; graphic register 0.  */
#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

/* Shift-out: the control code SO; graphic register 1.  */
#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

/* Locking-shift-2: ESC 'n'; graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'n';                               \
  } while (0)

/* Locking-shift-3: ESC 'o'; graphic register 3.  */
#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'o';                               \
  } while (0)
1162
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The body is a loop: as long as CHARSET is neither covered by a
   pending single-shift nor invoked to graphic plane 0 or 1,
   encode_invocation_designation is called and the checks are
   repeated.  The `do ... while' form is kept so that the macro stays
   a single statement.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift is pending: this character uses it up.  */    \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1) & 0x7F : (c1) | 0x80);                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        /* Graphic plane 0: the 8th bit is cleared.  */                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        /* Graphic plane 1: the 8th bit is set.  */                     \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we       \
       must invoke it, or, at first, designate it to some graphic       \
       register.  Then repeat the loop to actually produce the          \
       character.  */                                                   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1196
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The body is a loop: as long as CHARSET is neither covered by a
   pending single-shift nor invoked to graphic plane 0 or 1,
   encode_invocation_designation is called and the checks are
   repeated.  The `do ... while' form is kept so that the macro stays
   a single statement.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift is pending: this character uses it up.  */    \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        /* Graphic plane 0: the 8th bits are cleared.  */               \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        /* Graphic plane 1: the 8th bits are set.  */                   \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we       \
       must invoke it, or, at first, designate it to some graphic       \
       register.  Then repeat the loop to actually produce the          \
       character.  */                                                   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1229
/* Produce codes for a character whose character set is CHARSET and
   whose position-codes are C1 and C2.  When UNIFICATION_TABLE is
   non-nil and unify_char returns a non-negative character for it,
   that character is split by SPLIT_CHAR (which overwrites C1 and C2)
   and produced instead.  The actual codes are produced by
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to the dimension of the resulting charset.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
                                                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, charset_alt,         \
                            c1, c2);                                    \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (CHARSET_DIMENSION (charset_alt) != 1)                           \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);            \
    else                                                                \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                \
  } while (0)
1244
1245 /* Produce designation and invocation codes at a place pointed by DST
1246    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1247    Return new DST.  */
1248
1249 unsigned char *
1250 encode_invocation_designation (charset, coding, dst)
1251      int charset;
1252      struct coding_system *coding;
1253      unsigned char *dst;
1254 {
1255   int reg;                      /* graphic register number */
1256
1257   /* At first, check designations.  */
1258   for (reg = 0; reg < 4; reg++)
1259     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1260       break;
1261
1262   if (reg >= 4)
1263     {
1264       /* CHARSET is not yet designated to any graphic registers.  */
1265       /* At first check the requested designation.  */
1266       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1267       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1268         /* Since CHARSET requests no special designation, designate it
1269            to graphic register 0.  */
1270         reg = 0;
1271
1272       ENCODE_DESIGNATION (charset, reg, coding);
1273     }
1274
1275   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1276       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1277     {
1278       /* Since the graphic register REG is not invoked to any graphic
1279          planes, invoke it to graphic plane 0.  */
1280       switch (reg)
1281         {
1282         case 0:                 /* graphic register 0 */
1283           ENCODE_SHIFT_IN;
1284           break;
1285
1286         case 1:                 /* graphic register 1 */
1287           ENCODE_SHIFT_OUT;
1288           break;
1289
1290         case 2:                 /* graphic register 2 */
1291           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1292             ENCODE_SINGLE_SHIFT_2;
1293           else
1294             ENCODE_LOCKING_SHIFT_2;
1295           break;
1296
1297         case 3:                 /* graphic register 3 */
1298           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1299             ENCODE_SINGLE_SHIFT_3;
1300           else
1301             ENCODE_LOCKING_SHIFT_3;
1302           break;
1303         }
1304     }
1305   return dst;
1306 }
1307
/* The following three macros produce codes for indicating
   composition: start without embedded rules (ESC '0'), start with
   embedded rules (ESC '2'), and end (ESC '1').  Each one expands to a
   single parenthesized expression which stores two bytes at DST.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1312
/* The following three macros produce codes for indicating direction
   of text.  Each of them is a `do ... while (0)' block, to be used as
   a statement followed by a semicolon, and appends bytes at DST.  */

/* Produce a control sequence introducer: ESC '[' if
   CODING_FLAG_ISO_SEVEN_BITS is set in CODING->flags, the single code
   CSI otherwise.  The flag must be tested with `&'; CODING->flags
   holds other flag bits too, so comparing the whole word with `=='
   would miss it whenever any of them is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_CSI;                          \
      }                                                 \
  } while (0)

/* Start of right-to-left direction: CSI '2' ']'.  */
#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

/* End of the current direction (back to left-to-right):
   CSI '0' ']'.  */
#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
1328
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: a shift-in if graphic
   register 0 is not the one invoked to graphic plane 0, then, for
   each graphic register which has a non-negative initial designation
   different from its current one, a designation of the initial
   charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg;                                                            \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      {                                                                 \
        ENCODE_SHIFT_IN;                                                \
      }                                                                 \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_charset                                             \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);          \
                                                                        \
        if (initial_charset >= 0                                        \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg)               \
                != initial_charset))                                    \
          ENCODE_DESIGNATION (initial_charset, reg, coding);            \
      }                                                                 \
  } while (0)
1343
1344 /* Produce designation sequences of charsets in the line started from
1345    *SRC to a place pointed by DSTP.
1346
1347    If the current block ends before any end-of-line, we may fail to
1348    find all the necessary *designations.  */
1349 encode_designation_at_bol (coding, table, src, src_end, dstp)
1350      struct coding_system *coding;
1351      Lisp_Object table;
1352      unsigned char *src, *src_end, **dstp;
1353 {
1354   int charset, c, found = 0, reg;
1355   /* Table of charsets to be designated to each graphic register.  */
1356   int r[4];
1357   unsigned char *dst = *dstp;
1358
1359   for (reg = 0; reg < 4; reg++)
1360     r[reg] = -1;
1361
1362   while (src < src_end && *src != '\n' && found < 4)
1363     {
1364       int bytes = BYTES_BY_CHAR_HEAD (*src);
1365
1366       if (NILP (table))
1367         charset = CHARSET_AT (src);
1368       else
1369         {
1370           int c_alt, c1, c2;
1371
1372           SPLIT_STRING(src, bytes, charset, c1, c2);
1373           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1374             charset = CHAR_CHARSET (c_alt);
1375         }
1376
1377       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1378       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1379         {
1380           found++;
1381           r[reg] = charset;
1382         }
1383
1384       src += bytes;
1385     }
1386
1387   if (found)
1388     {
1389       for (reg = 0; reg < 4; reg++)
1390         if (r[reg] >= 0
1391             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1392           ENCODE_DESIGNATION (r[reg], reg, coding);
1393       *dstp = dst;
1394     }
1395 }
1396
1397 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1398
1399 int
1400 encode_coding_iso2022 (coding, source, destination,
1401                        src_bytes, dst_bytes, consumed)
1402      struct coding_system *coding;
1403      unsigned char *source, *destination;
1404      int src_bytes, dst_bytes;
1405      int *consumed;
1406 {
1407   unsigned char *src = source;
1408   unsigned char *src_end = source + src_bytes;
1409   unsigned char *dst = destination;
1410   unsigned char *dst_end = destination + dst_bytes;
1411   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1412      from DST_END to assure overflow checking is necessary only at the
1413      head of loop.  */
1414   unsigned char *adjusted_dst_end = dst_end - 19;
1415   Lisp_Object unification_table
1416       = coding->character_unification_table_for_encode;
1417
1418   if (!NILP (Venable_character_unification) && NILP (unification_table))
1419     unification_table = Vstandard_character_unification_table_for_encode;
1420
1421   while (src < src_end && dst < adjusted_dst_end)
1422     {
1423       /* SRC_BASE remembers the start position in source in each loop.
1424          The loop will be exited when there's not enough source text
1425          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1426          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1427          reset to SRC_BASE before exiting.  */
1428       unsigned char *src_base = src;
1429       int charset, c1, c2, c3, c4;
1430
1431       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1432           && CODING_SPEC_ISO_BOL (coding))
1433         {
1434           /* We have to produce designation sequences if any now.  */
1435           encode_designation_at_bol (coding, unification_table,
1436                                      src, src_end, &dst);
1437           CODING_SPEC_ISO_BOL (coding) = 0;
1438         }
1439
1440       c1 = *src++;
1441       /* If we are seeing a component of a composite character, we are
1442          seeing a leading-code specially encoded for composition, or a
1443          composition rule if composing with rule.  We must set C1
1444          to a normal leading-code or an ASCII code.  If we are not at
1445          a composed character, we must reset the composition state.  */
1446       if (COMPOSING_P (coding->composing))
1447         {
1448           if (c1 < 0xA0)
1449             {
1450               /* We are not in a composite character any longer.  */
1451               coding->composing = COMPOSING_NO;
1452               ENCODE_COMPOSITION_END;
1453             }
1454           else
1455             {
1456               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1457                 {
1458                   *dst++ = c1 & 0x7F;
1459                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1460                   continue;
1461                 }
1462               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1463                 coding->composing = COMPOSING_WITH_RULE_RULE;
1464               if (c1 == 0xA0)
1465                 {
1466                   /* This is an ASCII component.  */
1467                   ONE_MORE_BYTE (c1);
1468                   c1 &= 0x7F;
1469                 }
1470               else
1471                 /* This is a leading-code of non ASCII component.  */
1472                 c1 -= 0x20;
1473             }
1474         }
1475
1476       /* Now encode one character.  C1 is a control character, an
1477          ASCII character, or a leading-code of multi-byte character.  */
1478       switch (emacs_code_class[c1])
1479         {
1480         case EMACS_ascii_code:
1481           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1482           break;
1483
1484         case EMACS_control_code:
1485           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1486             ENCODE_RESET_PLANE_AND_REGISTER;
1487           *dst++ = c1;
1488           break;
1489
1490         case EMACS_carriage_return_code:
1491           if (!coding->selective)
1492             {
1493               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1494                 ENCODE_RESET_PLANE_AND_REGISTER;
1495               *dst++ = c1;
1496               break;
1497             }
1498           /* fall down to treat '\r' as '\n' ...  */
1499
1500         case EMACS_linefeed_code:
1501           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1502             ENCODE_RESET_PLANE_AND_REGISTER;
1503           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1504             bcopy (coding->spec.iso2022.initial_designation,
1505                    coding->spec.iso2022.current_designation,
1506                    sizeof coding->spec.iso2022.initial_designation);
1507           if (coding->eol_type == CODING_EOL_LF
1508               || coding->eol_type == CODING_EOL_UNDECIDED)
1509             *dst++ = ISO_CODE_LF;
1510           else if (coding->eol_type == CODING_EOL_CRLF)
1511             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1512           else
1513             *dst++ = ISO_CODE_CR;
1514           CODING_SPEC_ISO_BOL (coding) = 1;
1515           break;
1516
1517         case EMACS_leading_code_2:
1518           ONE_MORE_BYTE (c2);
1519           if (c2 < 0xA0)
1520             {
1521               /* invalid sequence */
1522               *dst++ = c1;
1523               *dst++ = c2;
1524             }
1525           else
1526             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1527           break;
1528
1529         case EMACS_leading_code_3:
1530           TWO_MORE_BYTES (c2, c3);
1531           if (c2 < 0xA0 || c3 < 0xA0)
1532             {
1533               /* invalid sequence */
1534               *dst++ = c1;
1535               *dst++ = c2;
1536               *dst++ = c3;
1537             }
1538           else if (c1 < LEADING_CODE_PRIVATE_11)
1539             ENCODE_ISO_CHARACTER (c1, c2, c3);
1540           else
1541             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1542           break;
1543
1544         case EMACS_leading_code_4:
1545           THREE_MORE_BYTES (c2, c3, c4);
1546           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1547             {
1548               /* invalid sequence */
1549               *dst++ = c1;
1550               *dst++ = c2;
1551               *dst++ = c3;
1552               *dst++ = c4;
1553             }
1554           else
1555             ENCODE_ISO_CHARACTER (c2, c3, c4);
1556           break;
1557
1558         case EMACS_leading_code_composition:
1559           ONE_MORE_BYTE (c2);
1560           if (c2 < 0xA0)
1561             {
1562               /* invalid sequence */
1563               *dst++ = c1;
1564               *dst++ = c2;
1565             }
1566           else if (c2 == 0xFF)
1567             {
1568               coding->composing = COMPOSING_WITH_RULE_HEAD;
1569               ENCODE_COMPOSITION_WITH_RULE_START;
1570             }
1571           else
1572             {
1573               /* Rewind one byte because it is a character code of
1574                  composition elements.  */
1575               src--;
1576               coding->composing = COMPOSING_NO_RULE_HEAD;
1577               ENCODE_COMPOSITION_NO_RULE_START;
1578             }
1579           break;
1580
1581         case EMACS_invalid_code:
1582           *dst++ = c1;
1583           break;
1584         }
1585       continue;
1586     label_end_of_loop:
1587       /* We reach here because the source date ends not at character
1588          boundary.  */
1589       coding->carryover_size = src_end - src_base;
1590       bcopy (src_base, coding->carryover, coding->carryover_size);
1591       src = src_end;
1592       break;
1593     }
1594
1595   /* If this is the last block of the text to be encoded, we must
1596      reset graphic planes and registers to the initial state.  */
1597   if (src >= src_end && coding->last_block)
1598     {
1599       ENCODE_RESET_PLANE_AND_REGISTER;
1600       if (coding->carryover_size > 0
1601           && coding->carryover_size < (dst_end - dst))
1602         {
1603           bcopy (coding->carryover, dst, coding->carryover_size);
1604           dst += coding->carryover_size;
1605           coding->carryover_size = 0;
1606         }
1607     }
1608   *consumed = src - source;
1609   return dst - destination;
1610 }
1611
1612 \f
1613 /*** 4. SJIS and BIG5 handlers ***/
1614
1615 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1616    quite widely.  So, for the moment, Emacs supports them in the bare
1617    C code.  But, in the future, they may be supported only by CCL.  */
1618
1619 /* SJIS is a coding system encoding three character sets: ASCII, right
1620    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1621    as is.  A character of charset katakana-jisx0201 is encoded by
1622    "position-code + 0x80".  A character of charset japanese-jisx0208
1623    is encoded in 2 bytes, but the two position-codes are divided and
1624    shifted so that they fit in the ranges below.
1625
1626    --- CODE RANGE of SJIS ---
1627    (character set)      (range)
1628    ASCII                0x00 .. 0x7F
1629    KATAKANA-JISX0201    0xA0 .. 0xDF
1630    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1631             (2nd byte)  0x40 .. 0xFF
1632    -------------------------------
1633
1634 */
1635
1636 /* BIG5 is a coding system encoding two character sets: ASCII and
1637    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1638    character set and is encoded in two bytes.
1639
1640    --- CODE RANGE of BIG5 ---
1641    (character set)      (range)
1642    ASCII                0x00 .. 0x7F
1643    Big5 (1st byte)      0xA1 .. 0xFE
1644         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1645    --------------------------
1646
1647    Since the number of characters in Big5 is larger than maximum
1648    characters in Emacs' charset (96x96), it can't be handled as one
1649    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1650    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1651    contains frequently used characters and the latter contains less
1652    frequently used characters.  */
1653
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2.
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.

   Every argument is parenthesized in the expansions, so expressions
   such as `p[0] & 0xFF' may be passed.  Arguments may be evaluated
   more than once and therefore must not have side effects.  Both
   expansions declare a local variable named `temp'.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Serial number of the character counted from (0xA1, 0x40).  */   \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    /* Serial number of the character within its charset.  */          \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* The first 0x3F codes of a row map to 0x40..0x7E, the rest to    \
       0xA1..0xFE.  */                                                  \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1686
/* Produce the character of CHARSET whose position-codes are C1 and C2
   by one of the DECODE_CHARACTER_XXX macros.  When the variable
   `unification_table' of the expansion site is non-nil and
   unify_char returns a non-negative character for it, that character
   is produced instead; in that case C1 and C2 are overwritten by
   SPLIT_CHAR.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2);  \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
  } while (0)
1701
/* Encode the character of CHARSET whose position-codes are C1 and C2
   and store the result at DST, advancing DST.

   This macro refers to the variables `unification_table', `dst' and
   `sjis_p' of the expansion site.  When `unification_table' is
   non-nil and unify_char returns a non-negative character, that
   character is encoded instead.  C1 and C2 may be modified.

   What is stored:
     ASCII                          -- C1 (1 byte)
     katakana-jisx0201 (SJIS only)  -- C1 (1 byte)
     other 1-dimensional charsets   -- charset, C1 (2 bytes)
     jisx0208 (SJIS only)           -- result of ENCODE_SJIS (2 bytes)
     big5-1, big5-2 (BIG5 only)     -- result of ENCODE_BIG5 (2 bytes)
     other 2-dimensional charsets   -- charset, C1, C2 (3 bytes)

   So up to 3 bytes are stored; the user of this macro has to assure
   that much room at DST.

   The expansion does not end with a semicolon; as with the other
   `do ... while (0)' macros, the user supplies it.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1743
1744 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1745    Check if a text is encoded in SJIS.  If it is, return
1746    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1747
1748 int
1749 detect_coding_sjis (src, src_end)
1750      unsigned char *src, *src_end;
1751 {
1752   unsigned char c;
1753
1754   while (src < src_end)
1755     {
1756       c = *src++;
1757       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1758         return 0;
1759       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1760         {
1761           if (src < src_end && *src++ < 0x40)
1762             return 0;
1763         }
1764     }
1765   return CODING_CATEGORY_MASK_SJIS;
1766 }
1767
1768 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1769    Check if a text is encoded in BIG5.  If it is, return
1770    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1771
1772 int
1773 detect_coding_big5 (src, src_end)
1774      unsigned char *src, *src_end;
1775 {
1776   unsigned char c;
1777
1778   while (src < src_end)
1779     {
1780       c = *src++;
1781       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1782         return 0;
1783       if (c >= 0xA1)
1784         {
1785           if (src >= src_end)
1786             break;
1787           c = *src++;
1788           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1789             return 0;
1790         }
1791     }
1792   return CODING_CATEGORY_MASK_BIG5;
1793 }
1794
1795 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1796    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1797
1798 int
1799 decode_coding_sjis_big5 (coding, source, destination,
1800                          src_bytes, dst_bytes, consumed, sjis_p)
1801      struct coding_system *coding;
1802      unsigned char *source, *destination;
1803      int src_bytes, dst_bytes;
1804      int *consumed;
1805      int sjis_p;
1806 {
1807   unsigned char *src = source;
1808   unsigned char *src_end = source + src_bytes;
1809   unsigned char *dst = destination;
1810   unsigned char *dst_end = destination + dst_bytes;
1811   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1812      from DST_END to assure overflow checking is necessary only at the
1813      head of loop.  */
1814   unsigned char *adjusted_dst_end = dst_end - 3;
1815   Lisp_Object unification_table
1816       = coding->character_unification_table_for_decode;
1817
1818   if (!NILP (Venable_character_unification) && NILP (unification_table))
1819     unification_table = Vstandard_character_unification_table_for_decode;
1820
1821   while (src < src_end && dst < adjusted_dst_end)
1822     {
1823       /* SRC_BASE remembers the start position in source in each loop.
1824          The loop will be exited when there's not enough source text
1825          to analyze two-byte character (within macro ONE_MORE_BYTE).
1826          In that case, SRC is reset to SRC_BASE before exiting.  */
1827       unsigned char *src_base = src;
1828       unsigned char c1 = *src++, c2, c3, c4;
1829
1830       if (c1 == '\r')
1831         {
1832           if (coding->eol_type == CODING_EOL_CRLF)
1833             {
1834               ONE_MORE_BYTE (c2);
1835               if (c2 == '\n')
1836                 *dst++ = c2;
1837               else
1838                 /* To process C2 again, SRC is subtracted by 1.  */
1839                 *dst++ = c1, src--;
1840             }
1841           else
1842             *dst++ = c1;
1843         }
1844       else if (c1 < 0x20)
1845         *dst++ = c1;
1846       else if (c1 < 0x80)
1847         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1848       else if (c1 < 0xA0 || c1 >= 0xE0)
1849         {
1850           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1851           if (sjis_p)
1852             {
1853               ONE_MORE_BYTE (c2);
1854               DECODE_SJIS (c1, c2, c3, c4);
1855               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1856             }
1857           else if (c1 >= 0xE0 && c1 < 0xFF)
1858             {
1859               int charset;
1860
1861               ONE_MORE_BYTE (c2);
1862               DECODE_BIG5 (c1, c2, charset, c3, c4);
1863               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1864             }
1865           else                  /* Invalid code */
1866             *dst++ = c1;
1867         }
1868       else
1869         {
1870           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1871           if (sjis_p)
1872             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1873           else
1874             {
1875               int charset;
1876
1877               ONE_MORE_BYTE (c2);
1878               DECODE_BIG5 (c1, c2, charset, c3, c4);
1879               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1880             }
1881         }
1882       continue;
1883
1884     label_end_of_loop:
1885       coding->carryover_size = src - src_base;
1886       bcopy (src_base, coding->carryover, coding->carryover_size);
1887       src = src_base;
1888       break;
1889     }
1890
1891   *consumed = src - source;
1892   return dst - destination;
1893 }
1894
1895 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1896    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1897    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1898    sure that all these charsets are registered as official charset
1899    (i.e. do not have extended leading-codes).  Characters of other
1900    charsets are produced without any encoding.  If SJIS_P is 1, encode
1901    SJIS text, else encode BIG5 text.  */
1902
1903 int
1904 encode_coding_sjis_big5 (coding, source, destination,
1905                          src_bytes, dst_bytes, consumed, sjis_p)
1906      struct coding_system *coding;
1907      unsigned char *source, *destination;
1908      int src_bytes, dst_bytes;
1909      int *consumed;
1910      int sjis_p;
1911 {
1912   unsigned char *src = source;
1913   unsigned char *src_end = source + src_bytes;
1914   unsigned char *dst = destination;
1915   unsigned char *dst_end = destination + dst_bytes;
1916   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1917      from DST_END to assure overflow checking is necessary only at the
1918      head of loop.  */
1919   unsigned char *adjusted_dst_end = dst_end - 1;
1920   Lisp_Object unification_table
1921       = coding->character_unification_table_for_encode;
1922
1923   if (!NILP (Venable_character_unification) && NILP (unification_table))
1924     unification_table = Vstandard_character_unification_table_for_encode;
1925
1926   while (src < src_end && dst < adjusted_dst_end)
1927     {
1928       /* SRC_BASE remembers the start position in source in each loop.
1929          The loop will be exited when there's not enough source text
1930          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1931          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1932          before exiting.  */
1933       unsigned char *src_base = src;
1934       unsigned char c1 = *src++, c2, c3, c4;
1935
1936       if (coding->composing)
1937         {
1938           if (c1 == 0xA0)
1939             {
1940               ONE_MORE_BYTE (c1);
1941               c1 &= 0x7F;
1942             }
1943           else if (c1 >= 0xA0)
1944             c1 -= 0x20;
1945           else
1946             coding->composing = 0;
1947         }
1948
1949       switch (emacs_code_class[c1])
1950         {
1951         case EMACS_ascii_code:
1952           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1953           break;
1954
1955         case EMACS_control_code:
1956           *dst++ = c1;
1957           break;
1958
1959         case EMACS_carriage_return_code:
1960           if (!coding->selective)
1961             {
1962               *dst++ = c1;
1963               break;
1964             }
1965           /* fall down to treat '\r' as '\n' ...  */
1966
1967         case EMACS_linefeed_code:
1968           if (coding->eol_type == CODING_EOL_LF
1969               || coding->eol_type == CODING_EOL_UNDECIDED)
1970             *dst++ = '\n';
1971           else if (coding->eol_type == CODING_EOL_CRLF)
1972             *dst++ = '\r', *dst++ = '\n';
1973           else
1974             *dst++ = '\r';
1975           break;
1976
1977         case EMACS_leading_code_2:
1978           ONE_MORE_BYTE (c2);
1979           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1980           break;
1981
1982         case EMACS_leading_code_3:
1983           TWO_MORE_BYTES (c2, c3);
1984           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1985           break;
1986
1987         case EMACS_leading_code_4:
1988           THREE_MORE_BYTES (c2, c3, c4);
1989           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1990           break;
1991
1992         case EMACS_leading_code_composition:
1993           coding->composing = 1;
1994           break;
1995
1996         default:                /* i.e. case EMACS_invalid_code: */
1997           *dst++ = c1;
1998         }
1999       continue;
2000
2001     label_end_of_loop:
2002       coding->carryover_size = src_end - src_base;
2003       bcopy (src_base, coding->carryover, coding->carryover_size);
2004       src = src_end;
2005       break;
2006     }
2007
2008   *consumed = src - source;
2009   return dst - destination;
2010 }
2011
2012 \f
2013 /*** 5. End-of-line handlers ***/
2014
2015 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2016    This function is called only when `coding->eol_type' is
2017    CODING_EOL_CRLF or CODING_EOL_CR.  */
2018
2019 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2020      struct coding_system *coding;
2021      unsigned char *source, *destination;
2022      int src_bytes, dst_bytes;
2023      int *consumed;
2024 {
2025   unsigned char *src = source;
2026   unsigned char *src_end = source + src_bytes;
2027   unsigned char *dst = destination;
2028   unsigned char *dst_end = destination + dst_bytes;
2029   int produced;
2030
2031   switch (coding->eol_type)
2032     {
2033     case CODING_EOL_CRLF:
2034       {
2035         /* Since the maximum bytes produced by each loop is 2, we
2036            subtract 1 from DST_END to assure overflow checking is
2037            necessary only at the head of loop.  */
2038         unsigned char *adjusted_dst_end = dst_end - 1;
2039
2040         while (src < src_end && dst < adjusted_dst_end)
2041           {
2042             unsigned char *src_base = src;
2043             unsigned char c = *src++;
2044             if (c == '\r')
2045               {
2046                 ONE_MORE_BYTE (c);
2047                 if (c != '\n')
2048                   *dst++ = '\r';
2049                 *dst++ = c;
2050               }
2051             else
2052               *dst++ = c;
2053             continue;
2054
2055           label_end_of_loop:
2056             coding->carryover_size = src - src_base;
2057             bcopy (src_base, coding->carryover, coding->carryover_size);
2058             src = src_base;
2059             break;
2060           }
2061         *consumed = src - source;
2062         produced = dst - destination;
2063         break;
2064       }
2065
2066     case CODING_EOL_CR:
2067       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2068       bcopy (source, destination, produced);
2069       dst_end = destination + produced;
2070       while (dst < dst_end)
2071         if (*dst++ == '\r') dst[-1] = '\n';
2072       *consumed = produced;
2073       break;
2074
2075     default:                    /* i.e. case: CODING_EOL_LF */
2076       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2077       bcopy (source, destination, produced);
2078       *consumed = produced;
2079       break;
2080     }
2081
2082   return produced;
2083 }
2084
2085 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2086    format of end-of-line according to `coding->eol_type'.  If
2087    `coding->selective' is 1, code '\r' in source text also means
2088    end-of-line.  */
2089
2090 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2091      struct coding_system *coding;
2092      unsigned char *source, *destination;
2093      int src_bytes, dst_bytes;
2094      int *consumed;
2095 {
2096   unsigned char *src = source;
2097   unsigned char *dst = destination;
2098   int produced;
2099
2100   if (src_bytes <= 0)
2101     return 0;
2102
2103   switch (coding->eol_type)
2104     {
2105     case CODING_EOL_LF:
2106     case CODING_EOL_UNDECIDED:
2107       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2108       bcopy (source, destination, produced);
2109       if (coding->selective)
2110         {
2111           int i = produced;
2112           while (i--)
2113             if (*dst++ == '\r') dst[-1] = '\n';
2114         }
2115       *consumed = produced;
2116
2117     case CODING_EOL_CRLF:
2118       {
2119         unsigned char c;
2120         unsigned char *src_end = source + src_bytes;
2121         unsigned char *dst_end = destination + dst_bytes;
2122         /* Since the maximum bytes produced by each loop is 2, we
2123            subtract 1 from DST_END to assure overflow checking is
2124            necessary only at the head of loop.  */
2125         unsigned char *adjusted_dst_end = dst_end - 1;
2126
2127         while (src < src_end && dst < adjusted_dst_end)
2128           {
2129             c = *src++;
2130             if (c == '\n' || (c == '\r' && coding->selective))
2131               *dst++ = '\r', *dst++ = '\n';
2132             else
2133               *dst++ = c;
2134           }
2135         produced = dst - destination;
2136         *consumed = src - source;
2137         break;
2138       }
2139
2140     default:                    /* i.e. case CODING_EOL_CR: */
2141       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2142       bcopy (source, destination, produced);
2143       {
2144         int i = produced;
2145         while (i--)
2146           if (*dst++ == '\n') dst[-1] = '\r';
2147       }
2148       *consumed = produced;
2149     }
2150
2151   return produced;
2152 }
2153
2154 \f
2155 /*** 6. C library functions ***/
2156
2157 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2158    has a property `coding-system'.  The value of this property is a
2159    vector of length 5 (called as coding-vector).  Among elements of
2160    this vector, the first (element[0]) and the fifth (element[4])
2161    carry important information for decoding/encoding.  Before
2162    decoding/encoding, this information should be set in fields of a
2163    structure of type `coding_system'.
2164
2165    A value of property `coding-system' can be a symbol of another
2166    subsidiary coding-system.  In that case, Emacs gets coding-vector
2167    from that symbol.
2168
2169    `element[0]' contains information to be set in `coding->type'.  The
2170    value and its meaning is as follows:
2171
2172    0 -- coding_type_emacs_mule
2173    1 -- coding_type_sjis
2174    2 -- coding_type_iso2022
2175    3 -- coding_type_big5
2176    4 -- coding_type_ccl encoder/decoder written in CCL
2177    nil -- coding_type_no_conversion
2178    t -- coding_type_undecided (automatic conversion on decoding,
2179                                no-conversion on encoding)
2180
2181    `element[4]' contains information to be set in `coding->flags' and
2182    `coding->spec'.  The meaning varies by `coding->type'.
2183
2184    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2185    of length 32 (of which the first 15 sub-elements are used now).
2186    Meanings of these sub-elements are:
2187
2188    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2189         If the value is an integer of valid charset, the charset is
2190         assumed to be designated to graphic register N initially.
2191
2192         If the value is minus, it is a minus value of charset which
2193         reserves graphic register N, which means that the charset is
2194         not designated initially but should be designated to graphic
2195         register N just before encoding a character in that charset.
2196
2197         If the value is nil, graphic register N is never used on
2198         encoding.
2199
2200    sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2201         Each value takes t or nil.  See the section ISO2022 of
2202         `coding.h' for more information.
2203
2204    If `coding->type' is `coding_type_big5', element[4] is t to denote
2205    BIG5-ETen or nil to denote BIG5-HKU.
2206
2207    If `coding->type' takes the other value, element[4] is ignored.
2208
2209    Emacs Lisp's coding system also carries information about format of
2210    end-of-line in a value of property `eol-type'.  If the value is
2211    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2212    means CODING_EOL_CR.  If it is not integer, it should be a vector
2213    of subsidiary coding systems of which property `eol-type' has one
2214    of above values.
2215
2216 */
2217
2218 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2219    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2220    is setup so that no conversion is necessary and return -1, else
2221    return 0.  */
2222
2223 int
2224 setup_coding_system (coding_system, coding)
2225      Lisp_Object coding_system;
2226      struct coding_system *coding;
2227 {
2228   Lisp_Object type, eol_type;
2229
2230   /* At first, set several fields to default values.  */
2231   coding->require_flushing = 0;
2232   coding->last_block = 0;
2233   coding->selective = 0;
2234   coding->composing = 0;
2235   coding->direction = 0;
2236   coding->carryover_size = 0;
2237   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2238   coding->character_unification_table_for_decode = Qnil;
2239   coding->character_unification_table_for_encode = Qnil;
2240
2241   Vlast_coding_system_used = coding->symbol = coding_system;
2242   eol_type = Qnil;
2243   /* Get value of property `coding-system' until we get a vector.
2244      While doing that, also get values of properties
2245      `post-read-conversion', `pre-write-conversion',
2246      `character-unification-table-for-decode',
2247      `character-unification-table-for-encode' and `eol-type'.  */
2248   while (!NILP (coding_system) && SYMBOLP (coding_system))
2249     {
2250       if (NILP (coding->post_read_conversion))
2251         coding->post_read_conversion = Fget (coding_system,
2252                                              Qpost_read_conversion);
2253       if (NILP (coding->pre_write_conversion))
2254         coding->pre_write_conversion = Fget (coding_system,
2255                                              Qpre_write_conversion);
2256       if (!inhibit_eol_conversion && NILP (eol_type))
2257         eol_type = Fget (coding_system, Qeol_type);
2258
2259       if (NILP (coding->character_unification_table_for_decode))
2260         coding->character_unification_table_for_decode
2261           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2262
2263       if (NILP (coding->character_unification_table_for_encode))
2264         coding->character_unification_table_for_encode
2265           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2266
2267       coding_system = Fget (coding_system, Qcoding_system);
2268     }
2269
2270   while (!NILP (coding->character_unification_table_for_decode)
2271          && SYMBOLP (coding->character_unification_table_for_decode))
2272         coding->character_unification_table_for_decode
2273           = Fget (coding->character_unification_table_for_decode,
2274                   Qcharacter_unification_table_for_decode);
2275   if (!NILP (coding->character_unification_table_for_decode)
2276       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2277       coding->character_unification_table_for_decode = Qnil;
2278
2279   while (!NILP (coding->character_unification_table_for_encode)
2280          && SYMBOLP (coding->character_unification_table_for_encode))
2281         coding->character_unification_table_for_encode
2282           = Fget (coding->character_unification_table_for_encode,
2283                   Qcharacter_unification_table_for_encode);
2284   if (!NILP (coding->character_unification_table_for_encode)
2285       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2286       coding->character_unification_table_for_encode = Qnil;
2287
2288   if (!VECTORP (coding_system)
2289       || XVECTOR (coding_system)->size != 5)
2290     goto label_invalid_coding_system;
2291
2292   if (VECTORP (eol_type))
2293     coding->eol_type = CODING_EOL_UNDECIDED;
2294   else if (XFASTINT (eol_type) == 1)
2295     coding->eol_type = CODING_EOL_CRLF;
2296   else if (XFASTINT (eol_type) == 2)
2297     coding->eol_type = CODING_EOL_CR;
2298   else
2299     coding->eol_type = CODING_EOL_LF;
2300
2301   type = XVECTOR (coding_system)->contents[0];
2302   switch (XFASTINT (type))
2303     {
2304     case 0:
2305       coding->type = coding_type_emacs_mule;
2306       break;
2307
2308     case 1:
2309       coding->type = coding_type_sjis;
2310       break;
2311
2312     case 2:
2313       coding->type = coding_type_iso2022;
2314       {
2315         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2316         Lisp_Object *flags;
2317         int i, charset, default_reg_bits = 0;
2318
2319         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2320           goto label_invalid_coding_system;
2321
2322         flags = XVECTOR (val)->contents;
2323         coding->flags
2324           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2325              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2326              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2327              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2328              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2329              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2330              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2331              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2332              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2333              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2334              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2335
2336         /* Invoke graphic register 0 to plane 0.  */
2337         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2338         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2339         CODING_SPEC_ISO_INVOCATION (coding, 1)
2340           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2341         /* Not single shifting at first.  */
2342         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2343         /* Beginning of buffer should also be regarded as bol. */
2344         CODING_SPEC_ISO_BOL(coding) = 1;
2345
2346         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2347            FLAGS[REG] can be one of below:
2348                 integer CHARSET: CHARSET occupies register I,
2349                 t: designate nothing to REG initially, but can be used
2350                   by any charsets,
2351                 list of integer, nil, or t: designate the first
2352                   element (if integer) to REG initially, the remaining
2353                   elements (if integer) is designated to REG on request,
2354                   if an element is t, REG can be used by any charset,
2355                 nil: REG is never used.  */
2356         for (charset = 0; charset <= MAX_CHARSET; charset++)
2357           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2358             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2359         for (i = 0; i < 4; i++)
2360           {
2361             if (INTEGERP (flags[i])
2362                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2363                 || (charset = get_charset_id (flags[i])) >= 0)
2364               {
2365                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2366                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2367               }
2368             else if (EQ (flags[i], Qt))
2369               {
2370                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2371                 default_reg_bits |= 1 << i;
2372               }
2373             else if (CONSP (flags[i]))
2374               {
2375                 Lisp_Object tail = flags[i];
2376
2377                 if (INTEGERP (XCONS (tail)->car)
2378                     && (charset = XINT (XCONS (tail)->car),
2379                         CHARSET_VALID_P (charset))
2380                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2381                   {
2382                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2383                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2384                   }
2385                 else
2386                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2387                 tail = XCONS (tail)->cdr;
2388                 while (CONSP (tail))
2389                   {
2390                     if (INTEGERP (XCONS (tail)->car)
2391                         && (charset = XINT (XCONS (tail)->car),
2392                             CHARSET_VALID_P (charset))
2393                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2394                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2395                         = i;
2396                     else if (EQ (XCONS (tail)->car, Qt))
2397                       default_reg_bits |= 1 << i;
2398                     tail = XCONS (tail)->cdr;
2399                   }
2400               }
2401             else
2402               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2403
2404             CODING_SPEC_ISO_DESIGNATION (coding, i)
2405               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2406           }
2407
2408         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2409           {
2410             /* REG 1 can be used only by locking shift in 7-bit env.  */
2411             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2412               default_reg_bits &= ~2;
2413             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2414               /* Without any shifting, only REG 0 and 1 can be used.  */
2415               default_reg_bits &= 3;
2416           }
2417
2418         for (charset = 0; charset <= MAX_CHARSET; charset++)
2419           if (CHARSET_VALID_P (charset)
2420               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2421                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2422             {
2423               /* We have not yet decided where to designate CHARSET.  */
2424               int reg_bits = default_reg_bits;
2425
2426               if (CHARSET_CHARS (charset) == 96)
2427                 /* A charset of CHARS96 can't be designated to REG 0.  */
2428                 reg_bits &= ~1;
2429
2430               if (reg_bits)
2431                 /* There exist some default graphic register.  */
2432                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2433                   = (reg_bits & 1
2434                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2435               else
2436                 /* We anyway have to designate CHARSET to somewhere.  */
2437                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2438                   = (CHARSET_CHARS (charset) == 94
2439                      ? 0
2440                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2441                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2442                         ? 1
2443                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2444                            ? 2 : 0)));
2445             }
2446       }
2447       coding->require_flushing = 1;
2448       break;
2449
2450     case 3:
2451       coding->type = coding_type_big5;
2452       coding->flags
2453         = (NILP (XVECTOR (coding_system)->contents[4])
2454            ? CODING_FLAG_BIG5_HKU
2455            : CODING_FLAG_BIG5_ETEN);
2456       break;
2457
2458     case 4:
2459       coding->type = coding_type_ccl;
2460       {
2461         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2462         if (CONSP  (val)
2463             && VECTORP (XCONS (val)->car)
2464             && VECTORP (XCONS (val)->cdr))
2465           {
2466             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2467             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2468           }
2469         else
2470           goto label_invalid_coding_system;
2471       }
2472       coding->require_flushing = 1;
2473       break;
2474
2475     default:
2476       if (EQ (type, Qt))
2477         coding->type = coding_type_undecided;
2478       else
2479         coding->type = coding_type_no_conversion;
2480       break;
2481     }
2482   return 0;
2483
2484  label_invalid_coding_system:
2485   coding->type = coding_type_no_conversion;
2486   coding->eol_type = CODING_EOL_LF;
2487   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2488     = Qnil;
2489   return -1;
2490 }
2491
2492 /* Emacs has a mechanism to automatically detect a coding system if it
2493    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2494    it's impossible to distinguish some coding systems accurately
2495    because they use the same range of codes.  So, at first, coding
2496    systems are categorized into 9 categories, which are:
2497
2498    o coding-category-emacs-mule
2499
2500         The category for a coding system which has the same code range
2501         as Emacs' internal format.  Assigned the coding-system (Lisp
2502         symbol) `emacs-mule' by default.
2503
2504    o coding-category-sjis
2505
2506         The category for a coding system which has the same code range
2507         as SJIS.  Assigned the coding-system (Lisp
2508         symbol) `japanese-shift-jis' by default.
2509
2510    o coding-category-iso-7
2511
2512         The category for a coding system which has the same code range
2513         as ISO2022 of 7-bit environment.  This doesn't use any locking
2514         shift and single shift functions.  Assigned the coding-system
2515         (Lisp symbol) `iso-2022-7bit' by default.
2516
2517    o coding-category-iso-8-1
2518
2519         The category for a coding system which has the same code range
2520         as ISO2022 of 8-bit environment and graphic plane 1 used only
2521         for DIMENSION1 charset.  This doesn't use any locking shift
2522         and single shift functions.  Assigned the coding-system (Lisp
2523         symbol) `iso-latin-1' by default.
2524
2525    o coding-category-iso-8-2
2526
2527         The category for a coding system which has the same code range
2528         as ISO2022 of 8-bit environment and graphic plane 1 used only
2529         for DIMENSION2 charset.  This doesn't use any locking shift
2530         and single shift functions.  Assigned the coding-system (Lisp
2531         symbol) `japanese-iso-8bit' by default.
2532
2533    o coding-category-iso-7-else
2534
2535         The category for a coding system which has the same code range
2536         as ISO2022 of 7-bit environment but uses locking shift or
2537         single shift functions.  Assigned the coding-system (Lisp
2538         symbol) `iso-2022-7bit-lock' by default.
2539
2540    o coding-category-iso-8-else
2541
2542         The category for a coding system which has the same code range
2543         as ISO2022 of 8-bit environment but uses locking shift or
2544         single shift functions.  Assigned the coding-system (Lisp
2545         symbol) `iso-2022-8bit-ss2' by default.
2546
2547    o coding-category-big5
2548
2549         The category for a coding system which has the same code range
2550         as BIG5.  Assigned the coding-system (Lisp symbol)
2551         `cn-big5' by default.
2552
2553    o coding-category-binary
2554
2555         The category for a coding system not categorized in any of the
2556         above.  Assigned the coding-system (Lisp symbol)
2557         `no-conversion' by default.
2558
2559    Each of them is a Lisp symbol and the value is an actual
2560    `coding-system's (this is also a Lisp symbol) assigned by a user.
2561    What Emacs does actually is to detect a category of coding system.
2562    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2563    decide only one possible category, it selects a category of the
2564    highest priority.  Priorities of categories are also specified by a
2565    user in a Lisp variable `coding-category-list'.
2566
2567 */
2568
2569 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2570    If it detects possible coding systems, return an integer in which
2571    appropriate flag bits are set.  Flag bits are defined by macros
2572    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2573
2574 int
2575 detect_coding_mask (src, src_bytes)
2576      unsigned char *src;
2577      int src_bytes;
2578 {
2579   register unsigned char c;
2580   unsigned char *src_end = src + src_bytes;
2581   int mask;
2582
2583   /* At first, skip all ASCII characters and control characters except
2584      for three ISO2022 specific control characters.  */
2585  label_loop_detect_coding:
2586   while (src < src_end)
2587     {
2588       c = *src;
2589       if (c >= 0x80
2590           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2591         break;
2592       src++;
2593     }
2594
2595   if (src >= src_end)
2596     /* We found nothing other than ASCII.  There's nothing to do.  */
2597     return CODING_CATEGORY_MASK_ANY;
2598
2599   /* The text seems to be encoded in some multilingual coding system.
2600      Now, try to find in which coding system the text is encoded.  */
2601   if (c < 0x80)
2602     {
2603       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2604       /* C is an ISO2022 specific control code of C0.  */
2605       mask = detect_coding_iso2022 (src, src_end);
2606       src++;
2607       if (mask == CODING_CATEGORY_MASK_ANY)
2608         /* No valid ISO2022 code follows C.  Try again.  */
2609         goto label_loop_detect_coding;
2610     }
2611   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2612     /* C is an ISO2022 specific control code of C1,
2613        or the first byte of SJIS's 2-byte character code,
2614        or a leading code of Emacs.  */
2615     mask = (detect_coding_iso2022 (src, src_end)
2616             | detect_coding_sjis (src, src_end)
2617             | detect_coding_emacs_mule (src, src_end)
2618             | CODING_CATEGORY_MASK_BINARY);
2619
2620   else if (c == ISO_CODE_CSI
2621            && (src < src_end
2622               && (*src == ']'
2623                   || (src + 1 < src_end
2624                       && src[1] == ']'
2625                       && (*src == '0' || *src == '1' || *src == '2')))))
2626     /* C is an ISO2022's control-sequence-introducer.  */
2627     mask = (detect_coding_iso2022 (src, src_end)
2628             | detect_coding_sjis (src, src_end)
2629             | detect_coding_emacs_mule (src, src_end)
2630             | CODING_CATEGORY_MASK_BINARY);
2631
2632   else if (c < 0xA0)
2633     /* C is the first byte of SJIS character code,
2634        or a leading-code of Emacs.  */
2635     mask = (detect_coding_sjis (src, src_end)
2636             | detect_coding_emacs_mule (src, src_end)
2637             | CODING_CATEGORY_MASK_BINARY);
2638
2639   else
2640     /* C is a character of ISO2022 in graphic plane right,
2641        or a SJIS's 1-byte character code (i.e. JISX0201),
2642        or the first byte of BIG5's 2-byte code.  */
2643     mask = (detect_coding_iso2022 (src, src_end)
2644             | detect_coding_sjis (src, src_end)
2645             | detect_coding_big5 (src, src_end)
2646             | CODING_CATEGORY_MASK_BINARY);
2647
2648   return mask;
2649 }
2650
2651 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2652    The information of the detected coding system is set in CODING.  */
2653
2654 void
2655 detect_coding (coding, src, src_bytes)
2656      struct coding_system *coding;
2657      unsigned char *src;
2658      int src_bytes;
2659 {
2660   int mask = detect_coding_mask (src, src_bytes);
2661   int idx;
2662
2663   if (mask == CODING_CATEGORY_MASK_ANY)
2664     /* We found nothing other than ASCII.  There's nothing to do.  */
2665     return;
2666
2667   if (!mask)
2668     /* The source text seems to be encoded in unknown coding system.
2669        Emacs regards the category of such a kind of coding system as
2670        `coding-category-binary'.  We assume that a user has assigned
2671        an appropriate coding system for a `coding-category-binary'.  */
2672     idx = CODING_CATEGORY_IDX_BINARY;
2673   else
2674     {
2675       /* We found some plausible coding systems.  Let's use a coding
2676          system of the highest priority.  */
2677       Lisp_Object val = Vcoding_category_list;
2678
2679       if (CONSP (val))
2680         while (!NILP (val))
2681           {
2682             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2683             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2684               break;
2685             val = XCONS (val)->cdr;
2686           }
2687       else
2688         val = Qnil;
2689
2690       if (NILP (val))
2691         {
2692           /* For unknown reason, `Vcoding_category_list' contains none
2693              of found categories.  Let's use any of them.  */
2694           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2695             if (mask & (1 << idx))
2696               break;
2697         }
2698     }
2699   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2700 }
2701
2702 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2703    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2704    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2705
2706 #define MAX_EOL_CHECK_COUNT 3
2707
2708 int
2709 detect_eol_type (src, src_bytes)
2710      unsigned char *src;
2711      int src_bytes;
2712 {
2713   unsigned char *src_end = src + src_bytes;
2714   unsigned char c;
2715   int total = 0;                /* How many end-of-lines are found so far.  */
2716   int eol_type = CODING_EOL_UNDECIDED;
2717   int this_eol_type;
2718
2719   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2720     {
2721       c = *src++;
2722       if (c == '\n' || c == '\r')
2723         {
2724           total++;
2725           if (c == '\n')
2726             this_eol_type = CODING_EOL_LF;
2727           else if (src >= src_end || *src != '\n')
2728             this_eol_type = CODING_EOL_CR;
2729           else
2730             this_eol_type = CODING_EOL_CRLF, src++;
2731
2732           if (eol_type == CODING_EOL_UNDECIDED)
2733             /* This is the first end-of-line.  */
2734             eol_type = this_eol_type;
2735           else if (eol_type != this_eol_type)
2736             /* The found type is different from what found before.
2737                We had better not decode end-of-line.  */
2738             return CODING_EOL_LF;
2739         }
2740     }
2741
2742   return eol_type;
2743 }
2744
2745 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2746    is encoded.  If it detects an appropriate format of end-of-line, it
2747    sets the information in *CODING.  */
2748
2749 void
2750 detect_eol (coding, src, src_bytes)
2751      struct coding_system *coding;
2752      unsigned char *src;
2753      int src_bytes;
2754 {
2755   Lisp_Object val;
2756   int eol_type = detect_eol_type (src, src_bytes);
2757
2758   if (eol_type == CODING_EOL_UNDECIDED)
2759     /*  We found no end-of-line in the source text.  */
2760     return;
2761
2762   val = Fget (coding->symbol, Qeol_type);
2763   if (VECTORP (val) && XVECTOR (val)->size == 3)
2764     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2765 }
2766
2767 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2768    decoding, it may detect coding system and format of end-of-line if
2769    those are not yet decided.  */
2770
2771 int
2772 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2773      struct coding_system *coding;
2774      unsigned char *source, *destination;
2775      int src_bytes, dst_bytes;
2776      int *consumed;
2777 {
2778   int produced;
2779
2780   if (src_bytes <= 0)
2781     {
2782       *consumed = 0;
2783       return 0;
2784     }
2785
2786   if (coding->type == coding_type_undecided)
2787     detect_coding (coding, source, src_bytes);
2788
2789   if (coding->eol_type == CODING_EOL_UNDECIDED)
2790     detect_eol (coding, source, src_bytes);
2791
2792   coding->carryover_size = 0;
2793   switch (coding->type)
2794     {
2795     case coding_type_no_conversion:
2796     label_no_conversion:
2797       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2798       bcopy (source, destination, produced);
2799       *consumed = produced;
2800       break;
2801
2802     case coding_type_emacs_mule:
2803     case coding_type_undecided:
2804       if (coding->eol_type == CODING_EOL_LF
2805           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2806         goto label_no_conversion;
2807       produced = decode_eol (coding, source, destination,
2808                              src_bytes, dst_bytes, consumed);
2809       break;
2810
2811     case coding_type_sjis:
2812       produced = decode_coding_sjis_big5 (coding, source, destination,
2813                                           src_bytes, dst_bytes, consumed,
2814                                           1);
2815       break;
2816
2817     case coding_type_iso2022:
2818       produced = decode_coding_iso2022 (coding, source, destination,
2819                                         src_bytes, dst_bytes, consumed);
2820       break;
2821
2822     case coding_type_big5:
2823       produced = decode_coding_sjis_big5 (coding, source, destination,
2824                                           src_bytes, dst_bytes, consumed,
2825                                           0);
2826       break;
2827
2828     case coding_type_ccl:
2829       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2830                              src_bytes, dst_bytes, consumed);
2831       break;
2832     }
2833
2834   return produced;
2835 }
2836
2837 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2838
2839 int
2840 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2841      struct coding_system *coding;
2842      unsigned char *source, *destination;
2843      int src_bytes, dst_bytes;
2844      int *consumed;
2845 {
2846   int produced;
2847
2848   switch (coding->type)
2849     {
2850     case coding_type_no_conversion:
2851     label_no_conversion:
2852       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2853       if (produced > 0)
2854         {
2855           bcopy (source, destination, produced);
2856           if (coding->selective)
2857             {
2858               unsigned char *p = destination, *pend = destination + produced;
2859               while (p < pend)
2860                 if (*p++ == '\015') p[-1] = '\n';
2861             }
2862         }
2863       *consumed = produced;
2864       break;
2865
2866     case coding_type_emacs_mule:
2867     case coding_type_undecided:
2868       if (coding->eol_type == CODING_EOL_LF
2869           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2870         goto label_no_conversion;
2871       produced = encode_eol (coding, source, destination,
2872                              src_bytes, dst_bytes, consumed);
2873       break;
2874
2875     case coding_type_sjis:
2876       produced = encode_coding_sjis_big5 (coding, source, destination,
2877                                           src_bytes, dst_bytes, consumed,
2878                                           1);
2879       break;
2880
2881     case coding_type_iso2022:
2882       produced = encode_coding_iso2022 (coding, source, destination,
2883                                         src_bytes, dst_bytes, consumed);
2884       break;
2885
2886     case coding_type_big5:
2887       produced = encode_coding_sjis_big5 (coding, source, destination,
2888                                           src_bytes, dst_bytes, consumed,
2889                                           0);
2890       break;
2891
2892     case coding_type_ccl:
2893       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2894                              src_bytes, dst_bytes, consumed);
2895       break;
2896     }
2897
2898   return produced;
2899 }
2900
2901 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2902
2903 /* Return maximum size (bytes) of a buffer enough for decoding
2904    SRC_BYTES of text encoded in CODING.  */
2905
2906 int
2907 decoding_buffer_size (coding, src_bytes)
2908      struct coding_system *coding;
2909      int src_bytes;
2910 {
2911   int magnification;
2912
2913   if (coding->type == coding_type_iso2022)
2914     magnification = 3;
2915   else if (coding->type == coding_type_ccl)
2916     magnification = coding->spec.ccl.decoder.buf_magnification;
2917   else
2918     magnification = 2;
2919
2920   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2921 }
2922
2923 /* Return maximum size (bytes) of a buffer enough for encoding
2924    SRC_BYTES of text to CODING.  */
2925
2926 int
2927 encoding_buffer_size (coding, src_bytes)
2928      struct coding_system *coding;
2929      int src_bytes;
2930 {
2931   int magnification;
2932
2933   if (coding->type == coding_type_ccl)
2934     magnification = coding->spec.ccl.encoder.buf_magnification;
2935   else
2936     magnification = 3;
2937
2938   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2939 }
2940
/* Smallest size ever allocated for the shared conversion buffer.  */
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and
   the number of bytes currently allocated for it.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  If the current buffer is too small, it is
   replaced by a larger one; the old contents are NOT preserved.

   NOTE(review): the result of xmalloc is not checked here, so what
   happens on memory exhaustion depends on xmalloc -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Never start doubling from zero (e.g. when no buffer has been
         allocated yet); otherwise the loop below would not end.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
        real_size *= 2;

      /* Allocate the new buffer before releasing the old one.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2969
2970 \f
2971 #ifdef emacs
2972 /*** 7. Emacs Lisp library functions ***/
2973
2974 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2975        1, 1, 0,
2976   "Return coding-spec of CODING-SYSTEM.\n\
2977 If CODING-SYSTEM is not a valid coding-system, return nil.")
2978   (obj)
2979      Lisp_Object obj;
2980 {
2981   while (SYMBOLP (obj) && !NILP (obj))
2982     obj = Fget (obj, Qcoding_system);
2983   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2984           ? Qnil : obj);
2985 }
2986
2987 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2988   "Return t if OBJECT is nil or a coding-system.\n\
2989 See document of make-coding-system for coding-system object.")
2990   (obj)
2991      Lisp_Object obj;
2992 {
2993   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2994 }
2995
2996 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2997        Sread_non_nil_coding_system, 1, 1, 0,
2998   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2999   (prompt)
3000      Lisp_Object prompt;
3001 {
3002   Lisp_Object val;
3003   do
3004     {
3005       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3006                               Qt, Qnil, Qnil, Qnil);
3007     }
3008   while (XSTRING (val)->size == 0);
3009   return (Fintern (val, Qnil));
3010 }
3011
3012 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3013   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3014   (prompt)
3015      Lisp_Object prompt;
3016 {
3017   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3018                                       Qt, Qnil, Qnil, Qnil);
3019   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3020 }
3021
3022 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3023        1, 1, 0,
3024   "Check validity of CODING-SYSTEM.\n\
3025 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3026 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3027 The value of property should be a vector of length 5.")
3028   (coding_system)
3029      Lisp_Object coding_system;
3030 {
3031   CHECK_SYMBOL (coding_system, 0);
3032   if (!NILP (Fcoding_system_p (coding_system)))
3033     return coding_system;
3034   while (1)
3035     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3036 }
3037
3038 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3039        2, 2, 0,
3040   "Detect coding system of the text in the region between START and END.\n\
3041 Return a list of possible coding systems ordered by priority.\n\
3042 If only ASCII characters are found, it returns `undecided'\n\
3043  or its subsidiary coding system according to a detected end-of-line format.")
3044   (b, e)
3045      Lisp_Object b, e;
3046 {
3047   int coding_mask, eol_type;
3048   Lisp_Object val;
3049   int beg, end;
3050
3051   validate_region (&b, &e);
3052   beg = XINT (b), end = XINT (e);
3053   if (beg < GPT && end >= GPT) move_gap (end);
3054
3055   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3056   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3057
3058   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3059     {
3060       val = intern ("undecided");
3061       if (eol_type != CODING_EOL_UNDECIDED)
3062         {
3063           Lisp_Object val2 = Fget (val, Qeol_type);
3064           if (VECTORP (val2))
3065             val = XVECTOR (val2)->contents[eol_type];
3066         }
3067     }
3068   else
3069     {
3070       Lisp_Object val2;
3071
3072       /* At first, gather possible coding-systems in VAL in a reverse
3073          order.  */
3074       val = Qnil;
3075       for (val2 = Vcoding_category_list;
3076            !NILP (val2);
3077            val2 = XCONS (val2)->cdr)
3078         {
3079           int idx
3080             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3081           if (coding_mask & (1 << idx))
3082             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3083         }
3084
3085       /* Then, change the order of the list, while getting subsidiary
3086          coding-systems.  */
3087       val2 = val;
3088       val = Qnil;
3089       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3090         {
3091           if (eol_type == CODING_EOL_UNDECIDED)
3092             val = Fcons (XCONS (val2)->car, val);
3093           else
3094             {
3095               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3096               if (VECTORP (val3))
3097                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3098               else
3099                 val = Fcons (XCONS (val2)->car, val);
3100             }
3101         }
3102     }
3103
3104   return val;
3105 }
3106
3107 /* Scan text in the region between *BEGP and *ENDP, skip characters
3108    which we never have to encode to (iff ENCODEP is 1) or decode from
3109    coding system CODING at the head and tail, then set BEGP and ENDP
3110    to the addresses of start and end of the text we actually convert.  */
3111
3112 void
3113 shrink_conversion_area (begp, endp, coding, encodep)
3114      unsigned char **begp, **endp;
3115      struct coding_system *coding;
3116      int encodep;
3117 {
3118   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3119
3120   if (coding->eol_type != CODING_EOL_LF
3121       && coding->eol_type != CODING_EOL_UNDECIDED)
3122     /* Since we anyway have to convert end-of-line format, it is not
3123        worth skipping at most 100 bytes or so.  */
3124     return;
3125
3126   if (encodep)                  /* for encoding */
3127     {
3128       switch (coding->type)
3129         {
3130         case coding_type_no_conversion:
3131         case coding_type_emacs_mule:
3132         case coding_type_undecided:
3133           /* We need no conversion.  */
3134           *begp = *endp;
3135           return;
3136         case coding_type_ccl:
3137           /* We can't skip any data.  */
3138           return;
3139         case coding_type_iso2022:
3140           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3141             {
3142               unsigned char *bol = beg_addr;
3143               while (beg_addr < end_addr && *beg_addr < 0x80)
3144                 {
3145                   beg_addr++;
3146                   if (*(beg_addr - 1) == '\n')
3147                     bol = beg_addr;
3148                 }
3149               beg_addr = bol;
3150               goto label_skip_tail;
3151             }
3152           /* fall down ... */
3153         default:
3154           /* We can skip all ASCII characters at the head and tail.  */
3155           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3156         label_skip_tail:
3157           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3158           break;
3159         }
3160     }
3161   else                          /* for decoding */
3162     {
3163       switch (coding->type)
3164         {
3165         case coding_type_no_conversion:
3166           /* We need no conversion.  */
3167           *begp = *endp;
3168           return;
3169         case coding_type_emacs_mule:
3170           if (coding->eol_type == CODING_EOL_LF)
3171             {
3172               /* We need no conversion.  */
3173               *begp = *endp;
3174               return;
3175             }
3176           /* We can skip all but carriage-return.  */
3177           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3178           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3179           break;
3180         case coding_type_sjis:
3181         case coding_type_big5:
3182           /* We can skip all ASCII characters at the head.  */
3183           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3184           /* We can skip all ASCII characters at the tail except for
3185              the second byte of SJIS or BIG5 code.  */
3186           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3187           if (end_addr != *endp)
3188             end_addr++;
3189           break;
3190         case coding_type_ccl:
3191           /* We can't skip any data.  */
3192           return;
3193         default:                /* i.e. case coding_type_iso2022: */
3194           {
3195             unsigned char c;
3196
3197             /* We can skip all ASCII characters except for a few
3198                control codes at the head.  */
3199             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3200                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3201                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3202               beg_addr++;
3203           }
3204           break;
3205         }
3206     }
3207   *begp = beg_addr;
3208   *endp = end_addr;
3209   return;
3210 }
3211
3212 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3213    text between B and E.  B and E are buffer position.  */
3214
3215 Lisp_Object
3216 code_convert_region (b, e, coding, encodep)
3217      Lisp_Object b, e;
3218      struct coding_system *coding;
3219      int encodep;
3220 {
3221   int beg, end, len, consumed, produced;
3222   char *buf;
3223   unsigned char *begp, *endp;
3224   int pos = PT;
3225
3226   validate_region (&b, &e);
3227   beg = XINT (b), end = XINT (e);
3228   if (beg < GPT && end >= GPT)
3229     move_gap (end);
3230
3231   if (encodep && !NILP (coding->pre_write_conversion))
3232     {
3233       /* We must call a pre-conversion function which may put a new
3234          text to be converted in a new buffer.  */
3235       struct buffer *old = current_buffer, *new;
3236
3237       TEMP_SET_PT (beg);
3238       call2 (coding->pre_write_conversion, b, e);
3239       if (old != current_buffer)
3240         {
3241           /* Replace the original text by the text just generated.  */
3242           len = ZV - BEGV;
3243           new = current_buffer;
3244           set_buffer_internal (old);
3245           del_range (beg, end);
3246           insert_from_buffer (new, 1, len, 0);
3247           end = beg + len;
3248         }
3249     }
3250
3251   /* We may be able to shrink the conversion region.  */
3252   begp = POS_ADDR (beg); endp = begp + (end - beg);
3253   shrink_conversion_area (&begp, &endp, coding, encodep);
3254
3255   if (begp == endp)
3256     /* We need no conversion.  */
3257     len = end - beg;
3258   else
3259     {
3260       beg += begp - POS_ADDR (beg);
3261       end =  beg + (endp - begp);
3262
3263       if (encodep)
3264         len = encoding_buffer_size (coding, end - beg);
3265       else
3266         len = decoding_buffer_size (coding, end - beg);
3267       buf = get_conversion_buffer (len);
3268
3269       coding->last_block = 1;
3270       produced = (encodep
3271                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3272                                    &consumed)
3273                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3274                                    &consumed));
3275
3276       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3277
3278       TEMP_SET_PT (beg);
3279       insert (buf, produced);
3280       del_range (PT, PT + end - beg);
3281       if (pos >= end)
3282         pos = PT + (pos - end);
3283       else if (pos > beg)
3284         pos = beg;
3285       TEMP_SET_PT (pos);
3286   }
3287
3288   if (!encodep && !NILP (coding->post_read_conversion))
3289     {
3290       /* We must call a post-conversion function which may alter
3291          the text just converted.  */
3292       Lisp_Object insval;
3293
3294       beg = XINT (b);
3295       TEMP_SET_PT (beg);
3296       insval = call1 (coding->post_read_conversion, make_number (len));
3297       CHECK_NUMBER (insval, 0);
3298       len = XINT (insval);
3299     }
3300
3301   return make_number (len);
3302 }
3303
3304 Lisp_Object
3305 code_convert_string (str, coding, encodep, nocopy)
3306      Lisp_Object str, nocopy;
3307      struct coding_system *coding;
3308      int encodep;
3309 {
3310   int len, consumed, produced;
3311   char *buf;
3312   unsigned char *begp, *endp;
3313   int head_skip, tail_skip;
3314   struct gcpro gcpro1;
3315
3316   if (encodep && !NILP (coding->pre_write_conversion)
3317       || !encodep && !NILP (coding->post_read_conversion))
3318     {
3319       /* Since we have to call Lisp functions which assume target text
3320          is in a buffer, after setting a temporary buffer, call
3321          code_convert_region.  */
3322       int count = specpdl_ptr - specpdl;
3323       int len = XSTRING (str)->size;
3324       Lisp_Object result;
3325       struct buffer *old = current_buffer;
3326
3327       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3328       temp_output_buffer_setup (" *code-converting-work*");
3329       set_buffer_internal (XBUFFER (Vstandard_output));
3330       insert_from_string (str, 0, len, 0);
3331       code_convert_region (make_number (BEGV), make_number (ZV),
3332                            coding, encodep);
3333       result = make_buffer_string (BEGV, ZV, 0);
3334       set_buffer_internal (old);
3335       return unbind_to (count, result);
3336     }
3337
3338   /* We may be able to shrink the conversion region.  */
3339   begp = XSTRING (str)->data;
3340   endp = begp + XSTRING (str)->size;
3341   shrink_conversion_area (&begp, &endp, coding, encodep);
3342
3343   if (begp == endp)
3344     /* We need no conversion.  */
3345     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3346
3347   head_skip = begp - XSTRING (str)->data;
3348   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3349
3350   GCPRO1 (str);
3351
3352   if (encodep)
3353     len = encoding_buffer_size (coding, endp - begp);
3354   else
3355     len = decoding_buffer_size (coding, endp - begp);
3356   buf = get_conversion_buffer (len + head_skip + tail_skip);
3357
3358   bcopy (XSTRING (str)->data, buf, head_skip);
3359   coding->last_block = 1;
3360   produced = (encodep
3361               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3362                                buf + head_skip, endp - begp, len, &consumed)
3363               : decode_coding (coding, XSTRING (str)->data + head_skip,
3364                                buf + head_skip, endp - begp, len, &consumed));
3365   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3366          buf + head_skip + produced,
3367          tail_skip);
3368
3369   UNGCPRO;
3370
3371   return make_string (buf, head_skip + produced + tail_skip);
3372 }
3373
3374 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3375        3, 3, "r\nzCoding system: ",
3376   "Decode current region by specified coding system.\n\
3377 When called from a program, takes three arguments:\n\
3378 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3379 Return length of decoded text.")
3380   (b, e, coding_system)
3381      Lisp_Object b, e, coding_system;
3382 {
3383   struct coding_system coding;
3384
3385   CHECK_NUMBER_COERCE_MARKER (b, 0);
3386   CHECK_NUMBER_COERCE_MARKER (e, 1);
3387   CHECK_SYMBOL (coding_system, 2);
3388
3389   if (NILP (coding_system))
3390     return make_number (XFASTINT (e) - XFASTINT (b));
3391   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3392     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3393
3394   return code_convert_region (b, e, &coding, 0);
3395 }
3396
3397 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3398        3, 3, "r\nzCoding system: ",
3399   "Encode current region by specified coding system.\n\
3400 When called from a program, takes three arguments:\n\
3401 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3402 Return length of encoded text.")
3403   (b, e, coding_system)
3404      Lisp_Object b, e, coding_system;
3405 {
3406   struct coding_system coding;
3407
3408   CHECK_NUMBER_COERCE_MARKER (b, 0);
3409   CHECK_NUMBER_COERCE_MARKER (e, 1);
3410   CHECK_SYMBOL (coding_system, 2);
3411
3412   if (NILP (coding_system))
3413     return make_number (XFASTINT (e) - XFASTINT (b));
3414   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3415     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3416
3417   return code_convert_region (b, e, &coding, 1);
3418 }
3419
3420 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3421        2, 3, 0,
3422   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3423 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3424 of decoding.")
3425   (string, coding_system, nocopy)
3426      Lisp_Object string, coding_system, nocopy;
3427 {
3428   struct coding_system coding;
3429
3430   CHECK_STRING (string, 0);
3431   CHECK_SYMBOL (coding_system, 1);
3432
3433   if (NILP (coding_system))
3434     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3435   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3436     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3437
3438   return code_convert_string (string, &coding, 0, nocopy);
3439 }
3440
3441 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3442        2, 3, 0,
3443   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3444 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3445 of encoding.")
3446   (string, coding_system, nocopy)
3447      Lisp_Object string, coding_system, nocopy;
3448 {
3449   struct coding_system coding;
3450
3451   CHECK_STRING (string, 0);
3452   CHECK_SYMBOL (coding_system, 1);
3453
3454   if (NILP (coding_system))
3455     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3456   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3457     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3458
3459   return code_convert_string (string, &coding, 1, nocopy);
3460 }
3461
3462 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3463   "Decode a JISX0208 character of shift-jis encoding.\n\
3464 CODE is the character code in SJIS.\n\
3465 Return the corresponding character.")
3466   (code)
3467      Lisp_Object code;
3468 {
3469   unsigned char c1, c2, s1, s2;
3470   Lisp_Object val;
3471
3472   CHECK_NUMBER (code, 0);
3473   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3474   DECODE_SJIS (s1, s2, c1, c2);
3475   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3476   return val;
3477 }
3478
3479 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3480   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3481 Return the corresponding character code in SJIS.")
3482   (ch)
3483      Lisp_Object ch;
3484 {
3485   int charset, c1, c2, s1, s2;
3486   Lisp_Object val;
3487
3488   CHECK_NUMBER (ch, 0);
3489   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3490   if (charset == charset_jisx0208)
3491     {
3492       ENCODE_SJIS (c1, c2, s1, s2);
3493       XSETFASTINT (val, (s1 << 8) | s2);
3494     }
3495   else
3496     XSETFASTINT (val, 0);
3497   return val;
3498 }
3499
3500 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3501   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3502 CODE is the character code in BIG5.\n\
3503 Return the corresponding character.")
3504   (code)
3505      Lisp_Object code;
3506 {
3507   int charset;
3508   unsigned char b1, b2, c1, c2;
3509   Lisp_Object val;
3510
3511   CHECK_NUMBER (code, 0);
3512   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3513   DECODE_BIG5 (b1, b2, charset, c1, c2);
3514   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3515   return val;
3516 }
3517
3518 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3519   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3520 Return the corresponding character code in Big5.")
3521   (ch)
3522      Lisp_Object ch;
3523 {
3524   int charset, c1, c2, b1, b2;
3525   Lisp_Object val;
3526
3527   CHECK_NUMBER (ch, 0);
3528   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3529   if (charset == charset_big5_1 || charset == charset_big5_2)
3530     {
3531       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3532       XSETFASTINT (val, (b1 << 8) | b2);
3533     }
3534   else
3535     XSETFASTINT (val, 0);
3536   return val;
3537 }
3538
3539 DEFUN ("set-terminal-coding-system-internal",
3540        Fset_terminal_coding_system_internal,
3541        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3542   (coding_system)
3543      Lisp_Object coding_system;
3544 {
3545   CHECK_SYMBOL (coding_system, 0);
3546   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3547   return Qnil;
3548 }
3549
3550 DEFUN ("terminal-coding-system",
3551        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3552   "Return coding-system of your terminal.")
3553   ()
3554 {
3555   return terminal_coding.symbol;
3556 }
3557
3558 DEFUN ("set-keyboard-coding-system-internal",
3559        Fset_keyboard_coding_system_internal,
3560        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3561   (coding_system)
3562      Lisp_Object coding_system;
3563 {
3564   CHECK_SYMBOL (coding_system, 0);
3565   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3566   return Qnil;
3567 }
3568
3569 DEFUN ("keyboard-coding-system",
3570        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3571   "Return coding-system of what is sent from terminal keyboard.")
3572   ()
3573 {
3574   return keyboard_coding.symbol;
3575 }
3576
3577 \f
3578 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3579        Sfind_operation_coding_system,  1, MANY, 0,
3580   "Choose a coding system for an operation based on the target name.\n\
3581 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3582 DECODING-SYSTEM is the coding system to use for decoding\n\
3583 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3584 for encoding (in case OPERATION does encoding).\n\
3585 \n\
3586 The first argument OPERATION specifies an I/O primitive:\n\
3587   For file I/O, `insert-file-contents' or `write-region'.\n\
3588   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3589   For network I/O, `open-network-stream'.\n\
3590 \n\
3591 The remaining arguments should be the same arguments that were passed\n\
3592 to the primitive.  Depending on which primitive, one of those arguments\n\
3593 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3594 whichever argument specifies the file name is TARGET.\n\
3595 \n\
3596 TARGET has a meaning which depends on OPERATION:\n\
3597   For file I/O, TARGET is a file name.\n\
3598   For process I/O, TARGET is a process name.\n\
3599   For network I/O, TARGET is a service name or a port number\n\
3600 \n\
3601 This function looks up what specified for TARGET in,\n\
3602 `file-coding-system-alist', `process-coding-system-alist',\n\
3603 or `network-coding-system-alist' depending on OPERATION.\n\
3604 They may specify a coding system, a cons of coding systems,\n\
3605 or a function symbol to call.\n\
3606 In the last case, we call the function with one argument,\n\
3607 which is a list of all the arguments given to this function.")
3608   (nargs, args)
3609      int nargs;
3610      Lisp_Object *args;
3611 {
3612   Lisp_Object operation, target_idx, target, val;
3613   register Lisp_Object chain;
3614
3615   if (nargs < 2)
3616     error ("Too few arguments");
3617   operation = args[0];
3618   if (!SYMBOLP (operation)
3619       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3620     error ("Invalid first arguement");
3621   if (nargs < 1 + XINT (target_idx))
3622     error ("Too few arguments for operation: %s",
3623            XSYMBOL (operation)->name->data);
3624   target = args[XINT (target_idx) + 1];
3625   if (!(STRINGP (target)
3626         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3627     error ("Invalid %dth argument", XINT (target_idx) + 1);
3628
3629   chain = ((EQ (operation, Qinsert_file_contents)
3630             || EQ (operation, Qwrite_region))
3631            ? Vfile_coding_system_alist
3632            : (EQ (operation, Qopen_network_stream)
3633               ? Vnetwork_coding_system_alist
3634               : Vprocess_coding_system_alist));
3635   if (NILP (chain))
3636     return Qnil;
3637
3638   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3639     {
3640       Lisp_Object elt = XCONS (chain)->car;
3641
3642       if (CONSP (elt)
3643           && ((STRINGP (target)
3644                && STRINGP (XCONS (elt)->car)
3645                && fast_string_match (XCONS (elt)->car, target) >= 0)
3646               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3647         {
3648           val = XCONS (elt)->cdr;
3649           if (CONSP (val))
3650             return val;
3651           if (! SYMBOLP (val))
3652             return Qnil;
3653           if (! NILP (Fcoding_system_p (val)))
3654             return Fcons (val, val);
3655           if (!NILP (Ffboundp (val)))
3656             return call1 (val, Flist (nargs, args));
3657           return Qnil;
3658         }
3659     }
3660   return Qnil;
3661 }
3662
3663 #endif /* emacs */
3664
3665 \f
3666 /*** 8. Post-amble ***/
3667
3668 init_coding_once ()
3669 {
3670   int i;
3671
3672   /* Emacs' internal format specific initialize routine.  */
3673   for (i = 0; i <= 0x20; i++)
3674     emacs_code_class[i] = EMACS_control_code;
3675   emacs_code_class[0x0A] = EMACS_linefeed_code;
3676   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3677   for (i = 0x21 ; i < 0x7F; i++)
3678     emacs_code_class[i] = EMACS_ascii_code;
3679   emacs_code_class[0x7F] = EMACS_control_code;
3680   emacs_code_class[0x80] = EMACS_leading_code_composition;
3681   for (i = 0x81; i < 0xFF; i++)
3682     emacs_code_class[i] = EMACS_invalid_code;
3683   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3684   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3685   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3686   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3687
3688   /* ISO2022 specific initialize routine.  */
3689   for (i = 0; i < 0x20; i++)
3690     iso_code_class[i] = ISO_control_code;
3691   for (i = 0x21; i < 0x7F; i++)
3692     iso_code_class[i] = ISO_graphic_plane_0;
3693   for (i = 0x80; i < 0xA0; i++)
3694     iso_code_class[i] = ISO_control_code;
3695   for (i = 0xA1; i < 0xFF; i++)
3696     iso_code_class[i] = ISO_graphic_plane_1;
3697   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3698   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3699   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3700   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3701   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3702   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3703   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3704   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3705   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3706   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3707
3708   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3709   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3710
3711   setup_coding_system (Qnil, &keyboard_coding);
3712   setup_coding_system (Qnil, &terminal_coding);
3713
3714 #if defined (MSDOS) || defined (WINDOWSNT)
3715   system_eol_type = CODING_EOL_CRLF;
3716 #else
3717   system_eol_type = CODING_EOL_LF;
3718 #endif
3719 }
3720
3721 #ifdef emacs
3722
3723 syms_of_coding ()
3724 {
3725   Qtarget_idx = intern ("target-idx");
3726   staticpro (&Qtarget_idx);
3727
3728   /* Target FILENAME is the first argument.  */
3729   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3730   /* Target FILENAME is the third argument.  */
3731   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3732
3733   Qcall_process = intern ("call-process");
3734   staticpro (&Qcall_process);
3735   /* Target PROGRAM is the first argument.  */
3736   Fput (Qcall_process, Qtarget_idx, make_number (0));
3737
3738   Qcall_process_region = intern ("call-process-region");
3739   staticpro (&Qcall_process_region);
3740   /* Target PROGRAM is the third argument.  */
3741   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3742
3743   Qstart_process = intern ("start-process");
3744   staticpro (&Qstart_process);
3745   /* Target PROGRAM is the third argument.  */
3746   Fput (Qstart_process, Qtarget_idx, make_number (2));
3747
3748   Qopen_network_stream = intern ("open-network-stream");
3749   staticpro (&Qopen_network_stream);
3750   /* Target SERVICE is the fourth argument.  */
3751   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3752
3753   Qcoding_system = intern ("coding-system");
3754   staticpro (&Qcoding_system);
3755
3756   Qeol_type = intern ("eol-type");
3757   staticpro (&Qeol_type);
3758
3759   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3760   staticpro (&Qbuffer_file_coding_system);
3761
3762   Qpost_read_conversion = intern ("post-read-conversion");
3763   staticpro (&Qpost_read_conversion);
3764
3765   Qpre_write_conversion = intern ("pre-write-conversion");
3766   staticpro (&Qpre_write_conversion);
3767
3768   Qcoding_system_spec = intern ("coding-system-spec");
3769   staticpro (&Qcoding_system_spec);
3770
3771   Qcoding_system_p = intern ("coding-system-p");
3772   staticpro (&Qcoding_system_p);
3773
3774   Qcoding_system_error = intern ("coding-system-error");
3775   staticpro (&Qcoding_system_error);
3776
3777   Fput (Qcoding_system_error, Qerror_conditions,
3778         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3779   Fput (Qcoding_system_error, Qerror_message,
3780         build_string ("Invalid coding system"));
3781
3782   Qcoding_category_index = intern ("coding-category-index");
3783   staticpro (&Qcoding_category_index);
3784
3785   {
3786     int i;
3787     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3788       {
3789         coding_category_table[i] = intern (coding_category_name[i]);
3790         staticpro (&coding_category_table[i]);
3791         Fput (coding_category_table[i], Qcoding_category_index,
3792               make_number (i));
3793       }
3794   }
3795
3796   Qcharacter_unification_table = intern ("character-unification-table");
3797   staticpro (&Qcharacter_unification_table);
3798   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3799         make_number (0));
3800
3801   Qcharacter_unification_table_for_decode
3802     = intern ("character-unification-table-for-decode");
3803   staticpro (&Qcharacter_unification_table_for_decode);
3804
3805   Qcharacter_unification_table_for_encode
3806     = intern ("character-unification-table-for-encode");
3807   staticpro (&Qcharacter_unification_table_for_encode);
3808
3809   Qemacs_mule = intern ("emacs-mule");
3810   staticpro (&Qemacs_mule);
3811
3812   defsubr (&Scoding_system_spec);
3813   defsubr (&Scoding_system_p);
3814   defsubr (&Sread_coding_system);
3815   defsubr (&Sread_non_nil_coding_system);
3816   defsubr (&Scheck_coding_system);
3817   defsubr (&Sdetect_coding_region);
3818   defsubr (&Sdecode_coding_region);
3819   defsubr (&Sencode_coding_region);
3820   defsubr (&Sdecode_coding_string);
3821   defsubr (&Sencode_coding_string);
3822   defsubr (&Sdecode_sjis_char);
3823   defsubr (&Sencode_sjis_char);
3824   defsubr (&Sdecode_big5_char);
3825   defsubr (&Sencode_big5_char);
3826   defsubr (&Sset_terminal_coding_system_internal);
3827   defsubr (&Sterminal_coding_system);
3828   defsubr (&Sset_keyboard_coding_system_internal);
3829   defsubr (&Skeyboard_coding_system);
3830   defsubr (&Sfind_operation_coding_system);
3831
3832   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3833     "List of coding-categories (symbols) ordered by priority.");
3834   {
3835     int i;
3836
3837     Vcoding_category_list = Qnil;
3838     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3839       Vcoding_category_list
3840         = Fcons (coding_category_table[i], Vcoding_category_list);
3841   }
3842
3843   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3844     "Specify the coding system for read operations.\n\
3845 It is useful to bind this variable with `let', but do not set it globally.\n\
3846 If the value is a coding system, it is used for decoding on read operation.\n\
3847 If not, an appropriate element is used from one of the coding system alists:\n\
3848 There are three such tables, `file-coding-system-alist',\n\
3849 `process-coding-system-alist', and `network-coding-system-alist'.");
3850   Vcoding_system_for_read = Qnil;
3851
3852   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3853     "Specify the coding system for write operations.\n\
3854 It is useful to bind this variable with `let', but do not set it globally.\n\
3855 If the value is a coding system, it is used for encoding on write operation.\n\
3856 If not, an appropriate element is used from one of the coding system alists:\n\
3857 There are three such tables, `file-coding-system-alist',\n\
3858 `process-coding-system-alist', and `network-coding-system-alist'.");
3859   Vcoding_system_for_write = Qnil;
3860
3861   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3862     "Coding system used in the latest file or process I/O.");
3863   Vlast_coding_system_used = Qnil;
3864
3865   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3866     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3867   inhibit_eol_conversion = 0;
3868
3869   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3870     "Alist to decide a coding system to use for a file I/O operation.\n\
3871 The format is ((PATTERN . VAL) ...),\n\
3872 where PATTERN is a regular expression matching a file name,\n\
3873 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3874 If VAL is a coding system, it is used for both decoding and encoding\n\
3875 the file contents.\n\
3876 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3877 and the cdr part is used for encoding.\n\
3878 If VAL is a function symbol, the function must return a coding system\n\
3879 or a cons of coding systems which are used as above.\n\
3880 \n\
3881 See also the function `find-operation-coding-system'.");
3882   Vfile_coding_system_alist = Qnil;
3883
3884   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3885     "Alist to decide a coding system to use for a process I/O operation.\n\
3886 The format is ((PATTERN . VAL) ...),\n\
3887 where PATTERN is a regular expression matching a program name,\n\
3888 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3889 If VAL is a coding system, it is used for both decoding what received\n\
3890 from the program and encoding what sent to the program.\n\
3891 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3892 and the cdr part is used for encoding.\n\
3893 If VAL is a function symbol, the function must return a coding system\n\
3894 or a cons of coding systems which are used as above.\n\
3895 \n\
3896 See also the function `find-operation-coding-system'.");
3897   Vprocess_coding_system_alist = Qnil;
3898
3899   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3900     "Alist to decide a coding system to use for a network I/O operation.\n\
3901 The format is ((PATTERN . VAL) ...),\n\
3902 where PATTERN is a regular expression matching a network service name\n\
3903 or is a port number to connect to,\n\
3904 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3905 If VAL is a coding system, it is used for both decoding what received\n\
3906 from the network stream and encoding what sent to the network stream.\n\
3907 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3908 and the cdr part is used for encoding.\n\
3909 If VAL is a function symbol, the function must return a coding system\n\
3910 or a cons of coding systems which are used as above.\n\
3911 \n\
3912 See also the function `find-operation-coding-system'.");
3913   Vnetwork_coding_system_alist = Qnil;
3914
3915   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3916     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3917   eol_mnemonic_unix = ':';
3918
3919   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3920     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3921   eol_mnemonic_dos = '\\';
3922
3923   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3924     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3925   eol_mnemonic_mac = '/';
3926
3927   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3928     "Mnemonic character indicating end-of-line format is not yet decided.");
3929   eol_mnemonic_undecided = ':';
3930
3931   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3932     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3933   Venable_character_unification = Qt;
3934
3935   DEFVAR_LISP ("standard-character-unification-table-for-decode",
3936     &Vstandard_character_unification_table_for_decode,
3937     "Table for unifying characters when reading.");
3938   Vstandard_character_unification_table_for_decode = Qnil;
3939
3940   DEFVAR_LISP ("standard-character-unification-table-for-encode",
3941     &Vstandard_character_unification_table_for_encode,
3942     "Table for unifying characters when writing.");
3943   Vstandard_character_unification_table_for_encode = Qnil;
3944
3945   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3946     "Alist of charsets vs revision numbers.\n\
3947 While encoding, if a charset (car part of an element) is found,\n\
3948 designate it with the escape sequence identifing revision (cdr part of the element).");
3949   Vcharset_revision_alist = Qnil;
3950
3951   DEFVAR_LISP ("default-process-coding-system",
3952                &Vdefault_process_coding_system,
3953     "Cons of coding systems used for process I/O by default.\n\
3954 The car part is used for decoding a process output,\n\
3955 the cdr part is used for encoding a text to be sent to a process.");
3956   Vdefault_process_coding_system = Qnil;
3957 }
3958
3959 #endif /* emacs */