src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Ver.1.0.
   3    Copyright (C) 1995 Free Software Foundation, Inc.
   4    Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   1. Preamble
  26   2. Emacs' internal format handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. End-of-line handlers
  30   6. C library functions
  31   7. Emacs Lisp library functions
  32   8. Post-amble
  33
  34 */
  35
  36 /*** GENERAL NOTE on CODING SYSTEM ***
  37
  38   Coding system is an encoding mechanism of one or more character
  39   sets.  Here's a list of coding systems which Emacs can handle.  When
  40   we say "decode", it means converting some other coding system to
  41   Emacs' internal format, and when we say "encode", it means
  42   converting Emacs' internal format to some other coding system.
  43
  44   0. Emacs' internal format
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in the section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and such coding
  53   systems used in Internet communication as ISO-2022-JP are all
  54   variants of ISO2022.  Details are described in the section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   the section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in the section 4.  In this file, when written as "BIG5"
  67   (all uppercase), it means the coding system, and when written as
  68   "Big5" (capitalized), it means the character set.
  69
  70   4. Else
  71
  72   If a user want to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represent a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See the section 6 for more
  81   detail.
  82
  83 */
  84
  85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  86
  87   How end-of-line of a text is encoded depends on a system.  For
  88   instance, Unix's format is just one byte of `line-feed' code,
  89   whereas DOS's format is two bytes sequence of `carriage-return' and
  90   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  91
  92   Since how characters in a text is encoded and how end-of-line is
  93   encoded is independent, any coding system described above can take
  94   any format of end-of-line.  So, Emacs has information of format of
  95   end-of-line in each coding-system.  See the section 6 for more
  96   detail.
  97
  98 */
  99
 100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 101
 102   These functions check if a text between SRC and SRC_END is encoded
 103   in the coding system category XXX.  Each returns an integer value in
 104   which appropriate flag bits for the category XXX is set.  The flag
 105   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 106   template of these functions.  */
 107 #if 0
 108 int
 109 detect_coding_internal (src, src_end)
 110      unsigned char *src, *src_end;
 111 {
 112   ...
 113 }
 114 #endif
 115
 116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 117
 118   These functions decode SRC_BYTES length text at SOURCE encoded in
 119   CODING to Emacs' internal format.  The resulting text goes to a
 120   place pointed by DESTINATION, the length of which should not exceed
 121   DST_BYTES.  The bytes actually processed is returned as *CONSUMED.
 122   The return value is the length of the decoded text.  Below is a
 123   template of these functions.  */
 124 #if 0
 125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 126      struct coding_system *coding;
 127      unsigned char *source, *destination;
 128      int src_bytes, dst_bytes;
 129      int *consumed;
 130 {
 131   ...
 132 }
 133 #endif
 134
 135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 136
 137   These functions encode SRC_BYTES length text at SOURCE of Emacs
 138   internal format to CODING.  The resulting text goes to a place
 139   pointed by DESTINATION, the length of which should not exceed
 140   DST_BYTES.  The bytes actually processed is returned as *CONSUMED.
 141   The return value is the length of the encoded text.  Below is a
 142   template of these functions.  */
 143 #if 0
 144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148      int *consumed;
 149 {
 150   ...
 151 }
 152 #endif
 153
 154 /*** COMMONLY USED MACROS ***/
 155
 156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 157    THREE_MORE_BYTES safely get one, two, and three bytes from the
 158    source text respectively.  If there are not enough bytes in the
 159    source, they jump to `label_end_of_loop'.  The caller should set
 160    variables `src' and `src_end' to appropriate areas in advance.  */
 161
 162 #define ONE_MORE_BYTE(c1)       \
 163   do {                          \
 164     if (src < src_end)          \
 165       c1 = *src++;              \
 166     else                        \
 167       goto label_end_of_loop;   \
 168   } while (0)
 169
 170 #define TWO_MORE_BYTES(c1, c2)  \
 171   do {                          \
 172     if (src + 1 < src_end)      \
 173       c1 = *src++, c2 = *src++; \
 174     else                        \
 175       goto label_end_of_loop;   \
 176   } while (0)
 177
 178 #define THREE_MORE_BYTES(c1, c2, c3)            \
 179   do {                                          \
 180     if (src + 2 < src_end)                      \
 181       c1 = *src++, c2 = *src++, c3 = *src++;    \
 182     else                                        \
 183       goto label_end_of_loop;                   \
 184   } while (0)
 185
 186 /* The following three macros DECODE_CHARACTER_ASCII,
 187    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 188    the multi-byte form of a character of each class at the place
 189    pointed by `dst'.  The caller should set the variable `dst' to
 190    point to an appropriate area and the variable `coding' to point to
 191    the coding-system of the currently decoding text in advance.  */
 192
 193 /* Decode one ASCII character C.  */
 194
 195 #define DECODE_CHARACTER_ASCII(c)                               \
 196   do {                                                          \
 197     if (COMPOSING_P (coding->composing))                        \
 198       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 199     else                                                        \
 200       *dst++ = (c);                                             \
 201   } while (0)
 202
 203 /* Decode one DIMENSION1 character of which charset is CHARSET and
 204    position-code is C.  */
 205
 206 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 207   do {                                                                  \
 208     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 209     if (COMPOSING_P (coding->composing))                                \
 210       *dst++ = leading_code + 0x20;                                     \
 211     else                                                                \
 212       *dst++ = leading_code;                                            \
 213     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 214       *dst++ = leading_code;                                            \
 215     *dst++ = (c) | 0x80;                                                \
 216   } while (0)
 217
 218 /* Decode one DIMENSION2 character of which charset is CHARSET and
 219    position-codes are C1 and C2.  */
 220
 221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 222   do {                                                  \
 223     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 224     *dst++ = (c2) | 0x80;                               \
 225   } while (0)
 226
 227 \f
 228 /*** 1. Preamble ***/
 229
 230 #include <stdio.h>
 231
 232 #ifdef emacs
 233
 234 #include <config.h>
 235 #include "lisp.h"
 236 #include "buffer.h"
 237 #include "charset.h"
 238 #include "ccl.h"
 239 #include "coding.h"
 240 #include "window.h"
 241
 242 #else  /* not emacs */
 243
 244 #include "mulelib.h"
 245
 246 #endif /* not emacs */
 247
 248 Lisp_Object Qcoding_system, Qeol_type;
 249 Lisp_Object Qbuffer_file_coding_system;
 250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 251
 252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 254 Lisp_Object Qstart_process, Qopen_network_stream;
 255 Lisp_Object Qtarget_idx;
 256
 257 /* Mnemonic character of each format of end-of-line.  */
 258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 259 /* Mnemonic character to indicate format of end-of-line is not yet
 260    decided.  */
 261 int eol_mnemonic_undecided;
 262
 263 #ifdef emacs
 264
 265 Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
 266
 267 /* Coding-systems are handed between Emacs Lisp programs and C internal
 268    routines by the following three variables.  */
 269 /* Coding-system for reading files and receiving data from process.  */
 270 Lisp_Object Vcoding_system_for_read;
 271 /* Coding-system for writing files and sending data to process.  */
 272 Lisp_Object Vcoding_system_for_write;
 273 /* Coding-system actually used in the latest I/O.  */
 274 Lisp_Object Vlast_coding_system_used;
 275
 276 /* Coding-system of what terminal accept for displaying.  */
 277 struct coding_system terminal_coding;
 278
 279 /* Coding-system of what is sent from terminal keyboard.  */
 280 struct coding_system keyboard_coding;
 281
 282 Lisp_Object Vcoding_system_alist;
 283
 284 #endif /* emacs */
 285
 286 Lisp_Object Qcoding_category_index;
 287
 288 /* List of symbols `coding-category-xxx' ordered by priority.  */
 289 Lisp_Object Vcoding_category_list;
 290
 291 /* Table of coding-systems currently assigned to each coding-category.  */
 292 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 293
 294 /* Table of names of symbol for each coding-category.  */
 295 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 296   "coding-category-internal",
 297   "coding-category-sjis",
 298   "coding-category-iso-7",
 299   "coding-category-iso-8-1",
 300   "coding-category-iso-8-2",
 301   "coding-category-iso-else",
 302   "coding-category-big5",
 303   "coding-category-binary"
 304 };
 305
 306 /* Alist of charsets vs the alternate charsets.  */
 307 Lisp_Object Valternate_charset_table;
 308
 309 /* Alist of charsets vs revision number.  */
 310 Lisp_Object Vcharset_revision_alist;
 311
 312 \f
 313 /*** 2. Emacs internal format handlers ***/
 314
 315 /* Emacs' internal format for encoding multiple character sets is a
 316    kind of multi-byte encoding, i.e. encoding a character by a sequence
 317    of one-byte codes of variable length.  ASCII characters and control
 318    characters (e.g. `tab', `newline') are represented by one-byte as
 319    is.  It takes the range 0x00 through 0x7F.  The other characters
 320    are represented by a sequence of `base leading-code', optional
 321    `extended leading-code', and one or two `position-code's.  Length
 322    of the sequence is decided by the base leading-code.  Leading-code
 323    takes the range 0x80 through 0x9F, whereas extended leading-code
 324    and position-code take the range 0xA0 through 0xFF.  See the
 325    document of `charset.h' for more detail about leading-code and
 326    position-code.
 327
 328    There's one exception in this rule.  Special leading-code
 329    `leading-code-composition' denotes that the following several
 330    characters should be composed into one character.  Leading-codes of
 331    components (except for ASCII) are added 0x20.  An ASCII character
 332    component is represented by a 2-byte sequence of `0xA0' and
 333    `ASCII-code + 0x80'.  See also the document in `charset.h' for the
 334    detail of composite character.  Hence, we can summarize the code
 335    range as follows:
 336
 337    --- CODE RANGE of Emacs' internal format ---
 338    (character set)      (range)
 339    ASCII                0x00 .. 0x7F
 340    ELSE (1st byte)      0x80 .. 0x9F
 341         (rest bytes)    0xA0 .. 0xFF
 342    ---------------------------------------------
 343
 344   */
 345
 346 enum emacs_code_class_type emacs_code_class[256];
 347
 348 /* Go to the next statement only if *SRC is accessible and the code is
 349    greater than 0xA0.  */
 350 #define CHECK_CODE_RANGE_A0_FF  \
 351   do {                          \
 352     if (src >= src_end)         \
 353       goto label_end_of_switch; \
 354     else if (*src++ < 0xA0)     \
 355       return 0;                 \
 356   } while (0)
 357
 358 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 359    Check if a text is encoded in Emacs' internal format.  If it is,
 360    return CODING_CATEGORY_MASK_INTERNAL, else return 0.  */
 361
 362 int
 363 detect_coding_internal (src, src_end)
 364      unsigned char *src, *src_end;
 365 {
 366   unsigned char c;
 367   int composing = 0;
 368
 369   while (src < src_end)
 370     {
 371       c = *src++;
 372
 373       if (composing)
 374         {
 375           if (c < 0xA0)
 376             composing = 0;
 377           else
 378             c -= 0x20;
 379         }
 380
 381       switch (emacs_code_class[c])
 382         {
 383         case EMACS_ascii_code:
 384         case EMACS_linefeed_code:
 385           break;
 386
 387         case EMACS_control_code:
 388           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 389             return 0;
 390           break;
 391
 392         case EMACS_invalid_code:
 393           return 0;
 394
 395         case EMACS_leading_code_composition: /* c == 0x80 */
 396           if (composing)
 397             CHECK_CODE_RANGE_A0_FF;
 398           else
 399             composing = 1;
 400           break;
 401
 402         case EMACS_leading_code_4:
 403           CHECK_CODE_RANGE_A0_FF;
 404           /* fall down to check it two more times ...  */
 405
 406         case EMACS_leading_code_3:
 407           CHECK_CODE_RANGE_A0_FF;
 408           /* fall down to check it one more time ...  */
 409
 410         case EMACS_leading_code_2:
 411           CHECK_CODE_RANGE_A0_FF;
 412           break;
 413
 414         default:
 415         label_end_of_switch:
 416           break;
 417         }
 418     }
 419   return CODING_CATEGORY_MASK_INTERNAL;
 420 }
 421
 422 \f
 423 /*** 3. ISO2022 handlers ***/
 424
 425 /* The following note describes the coding system ISO2022 briefly.
 426    Since the intension of this note is to help understanding of the
 427    programs in this file, some parts are NOT ACCURATE or OVERLY
 428    SIMPLIFIED.  For the thorough understanding, please refer to the
 429    original document of ISO2022.
 430
 431    ISO2022 provides many mechanisms to encode several character sets
 432    in 7-bit and 8-bit environment.  If one choose 7-bite environment,
 433    all text is encoded by codes of less than 128.  This may make the
 434    encoded text a little bit longer, but the text get more stability
 435    to pass through several gateways (some of them split MSB off).
 436
 437    There are two kind of character set: control character set and
 438    graphic character set.  The former contains control characters such
 439    as `newline' and `escape' to provide control functions (control
 440    functions are provided also by escape sequence).  The latter
 441    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 442    two control character sets and many graphic character sets.
 443
 444    Graphic character sets are classified into one of the following
 445    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 446    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 447    bytes (DIMENSION) and the number of characters in one dimension
 448    (CHARS) of the set.  In addition, each character set is assigned an
 449    identification tag (called "final character" and denoted as <F>
 450    here after) which is unique in each class.  <F> of each character
 451    set is decided by ECMA(*) when it is registered in ISO.  Code range
 452    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 453
 454    Note (*): ECMA = European Computer Manufacturers Association
 455
 456    Here are examples of graphic character set [NAME(<F>)]:
 457         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 458         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 459         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 460         o DIMENSION2_CHARS96 -- none for the moment
 461
 462    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 463         C0 [0x00..0x1F] -- control character plane 0
 464         GL [0x20..0x7F] -- graphic character plane 0
 465         C1 [0x80..0x9F] -- control character plane 1
 466         GR [0xA0..0xFF] -- graphic character plane 1
 467
 468    A control character set is directly designated and invoked to C0 or
 469    C1 by an escape sequence.  The most common case is that ISO646's
 470    control character set is designated/invoked to C0 and ISO6429's
 471    control character set is designated/invoked to C1, and usually
 472    these designations/invocations are omitted in a coded text.  With
 473    7-bit environment, only C0 can be used, and a control character for
 474    C1 is encoded by an appropriate escape sequence to fit in the
 475    environment.  All control characters for C1 are defined the
 476    corresponding escape sequences.
 477
 478    A graphic character set is at first designated to one of four
 479    graphic registers (G0 through G3), then these graphic registers are
 480    invoked to GL or GR.  These designations and invocations can be
 481    done independently.  The most common case is that G0 is invoked to
 482    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 483    these invocations and designations are omitted in a coded text.
 484    With 7-bit environment, only GL can be used.
 485
 486    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 487    and 0x7F of GL area work as control characters SPACE and DEL
 488    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 489
 490    There are two ways of invocation: locking-shift and single-shift.
 491    With locking-shift, the invocation lasts until the next different
 492    invocation, whereas with single-shift, the invocation works only
 493    for the following character and doesn't affect locking-shift.
 494    Invocations are done by the following control characters or escape
 495    sequences.
 496
 497    ----------------------------------------------------------------------
 498    function             control char    escape sequence description
 499    ----------------------------------------------------------------------
 500    SI  (shift-in)               0x0F    none            invoke G0 to GL
 501    SI  (shift-out)              0x0E    none            invoke G1 to GL
 502    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 503    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 504    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 505    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 506    ----------------------------------------------------------------------
 507    The first four are for locking-shift.  Control characters for these
 508    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 509
 510    Designations are done by the following escape sequences.
 511    ----------------------------------------------------------------------
 512    escape sequence      description
 513    ----------------------------------------------------------------------
 514    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 515    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 516    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 517    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 518    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 519    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 520    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 521    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 522    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 523    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 524    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 525    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 526    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 527    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 528    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 529    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 530    ----------------------------------------------------------------------
 531
 532    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 533    of dimension 1, chars 94, and final character <F>, and etc.
 534
 535    Note (*): Although these designations are not allowed in ISO2022,
 536    Emacs accepts them on decoding, and produces them on encoding
 537    CHARS96 character set in a coding system which is characterized as
 538    7-bit environment, non-locking-shift, and non-single-shift.
 539
 540    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 541    '(' can be omitted.  We call this as "short-form" here after.
 542
 543    Now you may notice that there are a lot of ways for encoding the
 544    same multilingual text in ISO2022.  Actually, there exist many
 545    coding systems such as Compound Text (used in X's inter client
 546    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 547    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 548    localized platforms), and all of these are variants of ISO2022.
 549
 550    In addition to the above, Emacs handles two more kinds of escape
 551    sequences: ISO6429's direction specification and Emacs' private
 552    sequence for specifying character composition.
 553
 554    ISO6429's direction specification takes the following format:
 555         o CSI ']'      -- end of the current direction
 556         o CSI '0' ']'  -- end of the current direction
 557         o CSI '1' ']'  -- start of left-to-right text
 558         o CSI '2' ']'  -- start of right-to-left text
 559    The control character CSI (0x9B: control sequence introducer) is
 560    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 561
 562    Character composition specification takes the following format:
 563         o ESC '0' -- start character composition
 564         o ESC '1' -- end character composition
 565    Since these are not standard escape sequences of any ISO, the use
 566    of them for these meaning is restricted to Emacs only.  */
 567
 568 enum iso_code_class_type iso_code_class[256];
 569
 570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 571    Check if a text is encoded in ISO2022.  If it is, returns an
 572    integer in which appropriate flag bits any of:
 573         CODING_CATEGORY_MASK_ISO_7
 574         CODING_CATEGORY_MASK_ISO_8_1
 575         CODING_CATEGORY_MASK_ISO_8_2
 576         CODING_CATEGORY_MASK_ISO_ELSE
 577    are set.  If a code which should never appear in ISO2022 is found,
 578    returns 0.  */
 579
 580 int
 581 detect_coding_iso2022 (src, src_end)
 582      unsigned char *src, *src_end;
 583 {
 584   int mask = CODING_CATEGORY_MASK_ANY;
 585   int g1 = 0;                   /* 1 iff designating to G1.  */
 586   int c, i;
 587
 588   while (src < src_end)
 589     {
 590       c = *src++;
 591       switch (c)
 592         {
 593         case ISO_CODE_ESC:
 594           if (src >= src_end)
 595             break;
 596           c = *src++;
 597           if (src < src_end
 598               && ((c >= '(' && c <= '/')
 599                   || c == '$' && ((*src >= '(' && *src <= '/')
 600                                   || (*src >= '@' && *src <= 'B'))))
 601             {
 602               /* Valid designation sequence.  */
 603               mask &= (CODING_CATEGORY_MASK_ISO_7
 604                        | CODING_CATEGORY_MASK_ISO_8_1
 605                        | CODING_CATEGORY_MASK_ISO_8_2
 606                        | CODING_CATEGORY_MASK_ISO_ELSE);
 607               if (c == ')' || (c == '$' && *src == ')'))
 608                 {
 609                   g1 = 1;
 610                   mask &= ~CODING_CATEGORY_MASK_ISO_7;
 611                 }
 612               src++;
 613               break;
 614             }
 615           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 616             return CODING_CATEGORY_MASK_ISO_ELSE;
 617           break;
 618
 619         case ISO_CODE_SO:
 620           if (g1)
 621             return CODING_CATEGORY_MASK_ISO_ELSE;
 622           break;
 623
 624         case ISO_CODE_CSI:
 625         case ISO_CODE_SS2:
 626         case ISO_CODE_SS3:
 627           mask &= ~CODING_CATEGORY_MASK_ISO_7;
 628           break;
 629
 630         default:
 631           if (c < 0x80)
 632             break;
 633           else if (c < 0xA0)
 634             return 0;
 635           else
 636             {
 637               int count = 1;
 638
 639               mask &= ~CODING_CATEGORY_MASK_ISO_7;
 640               while (src < src_end && *src >= 0xA0)
 641                 count++, src++;
 642               if (count & 1 && src < src_end)
 643                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 644             }
 645           break;
 646         }
 647     }
 648
 649   return mask;
 650 }
 651
 652 /* Decode a character of which charset is CHARSET and the 1st position
 653    code is C1.  If dimension of CHARSET 2, the 2nd position code is
 654    fetched from SRC and set to C2.  If CHARSET is negative, it means
 655    that we are decoding ill formed text, and what we can do is just to
 656    read C1 as is.  */
 657
 658 #define DECODE_ISO_CHARACTER(charset, c1)                       \
 659   do {                                                          \
 660     if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2)     \
 661       ONE_MORE_BYTE (c2);                                       \
 662     if (COMPOSING_HEAD_P (coding->composing))                   \
 663       {                                                         \
 664         *dst++ = LEADING_CODE_COMPOSITION;                      \
 665         if (COMPOSING_WITH_RULE_P (coding->composing))          \
 666           /* To tell composition rules are embeded.  */         \
 667           *dst++ = 0xFF;                                        \
 668         coding->composing += 2;                                 \
 669       }                                                         \
 670     if ((charset) < 0)                                          \
 671       *dst++ = c1;                                              \
 672     else if ((charset) == CHARSET_ASCII)                        \
 673       DECODE_CHARACTER_ASCII (c1);                              \
 674     else if (CHARSET_DIMENSION (charset) == 1)                  \
 675       DECODE_CHARACTER_DIMENSION1 (charset, c1);                \
 676     else                                                        \
 677       DECODE_CHARACTER_DIMENSION2 (charset, c1, c2);            \
 678     if (COMPOSING_WITH_RULE_P (coding->composing))              \
 679       /* To tell a composition rule follows.  */                \
 680       coding->composing = COMPOSING_WITH_RULE_RULE;             \
 681   } while (0)
 682
 683 /* Set designation state into CODING.  */
 684 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 685   do {                                                                  \
 686     int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);     \
 687     Lisp_Object temp                                                    \
 688       = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table);     \
 689     if (! NILP (temp))                                                  \
 690       charset = get_charset_id (XCONS (temp)->cdr);                     \
 691     if (charset >= 0)                                                   \
 692       {                                                                 \
 693         if (coding->direction == 1                                      \
 694             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 695           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 696         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 697       }                                                                 \
 698   } while (0)
 699
 700 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 701
 702 int
 703 decode_coding_iso2022 (coding, source, destination,
 704                        src_bytes, dst_bytes, consumed)
 705      struct coding_system *coding;
 706      unsigned char *source, *destination;
 707      int src_bytes, dst_bytes;
 708      int *consumed;
 709 {
 710   unsigned char *src = source;
 711   unsigned char *src_end = source + src_bytes;
 712   unsigned char *dst = destination;
 713   unsigned char *dst_end = destination + dst_bytes;
 714   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 715      from DST_END to assure that overflow checking is necessary only
 716      at the head of loop.  */
 717   unsigned char *adjusted_dst_end = dst_end - 6;
 718   int charset;
 719   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 720   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 721   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 722
 723   while (src < src_end && dst < adjusted_dst_end)
 724     {
 725       /* SRC_BASE remembers the start position in source in each loop.
 726          The loop will be exited when there's not enough source text
 727          to analyze long escape sequence or 2-byte code (within macros
 728          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 729          to SRC_BASE before exiting.  */
 730       unsigned char *src_base = src;
 731       unsigned char c1 = *src++, c2, cmprule;
 732
 733       switch (iso_code_class [c1])
 734         {
 735         case ISO_0x20_or_0x7F:
 736           if (!coding->composing
 737               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 738             {
 739               /* This is SPACE or DEL.  */
 740               *dst++ = c1;
 741               break;
 742             }
 743           /* This is a graphic character, we fall down ...  */
 744
 745         case ISO_graphic_plane_0:
 746           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 747             {
 748               /* This is a composition rule.  */
 749               *dst++ = c1 | 0x80;
 750               coding->composing = COMPOSING_WITH_RULE_TAIL;
 751             }
 752           else
 753             DECODE_ISO_CHARACTER (charset0, c1);
 754           break;
 755
 756         case ISO_0xA0_or_0xFF:
 757           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 758             {
 759               /* Invalid code.  */
 760               *dst++ = c1;
 761               break;
 762             }
 763           /* This is a graphic character, we fall down ... */
 764
 765         case ISO_graphic_plane_1:
 766           DECODE_ISO_CHARACTER (charset1, c1);
 767           break;
 768
 769         case ISO_control_code:
 770           /* All ISO2022 control characters in this class have the
 771              same representation in Emacs internal format.  */
 772           *dst++ = c1;
 773           break;
 774
 775         case ISO_carriage_return:
 776           if (coding->eol_type == CODING_EOL_CR)
 777             {
 778               *dst++ = '\n';
 779             }
 780           else if (coding->eol_type == CODING_EOL_CRLF)
 781             {
 782               ONE_MORE_BYTE (c1);
 783               if (c1 == ISO_CODE_LF)
 784                 *dst++ = '\n';
 785               else
 786                 {
 787                   src--;
 788                   *dst++ = c1;
 789                 }
 790             }
 791           else
 792             {
 793               *dst++ = c1;
 794             }
 795           break;
 796
 797         case ISO_shift_out:
 798           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 799             goto label_invalid_escape_sequence;
 800           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 801           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 802           break;
 803
 804         case ISO_shift_in:
 805           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 806           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 807           break;
 808
 809         case ISO_single_shift_2_7:
 810         case ISO_single_shift_2:
 811           /* SS2 is handled as an escape sequence of ESC 'N' */
 812           c1 = 'N';
 813           goto label_escape_sequence;
 814
 815         case ISO_single_shift_3:
 816           /* SS2 is handled as an escape sequence of ESC 'O' */
 817           c1 = 'O';
 818           goto label_escape_sequence;
 819
 820         case ISO_control_sequence_introducer:
 821           /* CSI is handled as an escape sequence of ESC '[' ...  */
 822           c1 = '[';
 823           goto label_escape_sequence;
 824
 825         case ISO_escape:
 826           ONE_MORE_BYTE (c1);
 827         label_escape_sequence:
 828           /* Escape sequences handled by Emacs are invocation,
 829              designation, direction specification, and character
 830              composition specification.  */
 831           switch (c1)
 832             {
 833             case '&':           /* revision of following character set */
 834               ONE_MORE_BYTE (c1);
 835               if (!(c1 >= '@' && c1 <= '~'))
 836                 goto label_invalid_escape_sequence;
 837               ONE_MORE_BYTE (c1);
 838               if (c1 != ISO_CODE_ESC)
 839                 goto label_invalid_escape_sequence;
 840               ONE_MORE_BYTE (c1);
 841               goto label_escape_sequence;
 842
 843             case '$':           /* designation of 2-byte character set */
 844               ONE_MORE_BYTE (c1);
 845               if (c1 >= '@' && c1 <= 'B')
 846                 {       /* designation of JISX0208.1978, GB2312.1980,
 847                                    or JISX0208.1980 */
 848                   DECODE_DESIGNATION (0, 2, 94, c1);
 849                 }
 850               else if (c1 >= 0x28 && c1 <= 0x2B)
 851                 {       /* designation of DIMENSION2_CHARS94 character set */
 852                   ONE_MORE_BYTE (c2);
 853                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 854                 }
 855               else if (c1 >= 0x2C && c1 <= 0x2F)
 856                 {       /* designation of DIMENSION2_CHARS96 character set */
 857                   ONE_MORE_BYTE (c2);
 858                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 859                 }
 860               else
 861                 goto label_invalid_escape_sequence;
 862               break;
 863
 864             case 'n':           /* invocation of locking-shift-2 */
 865               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 866                 goto label_invalid_escape_sequence;
 867               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 868               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 869               break;
 870
 871             case 'o':           /* invocation of locking-shift-3 */
 872               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 873                 goto label_invalid_escape_sequence;
 874               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 875               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 876               break;
 877
 878             case 'N':           /* invocation of single-shift-2 */
 879               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 880                 goto label_invalid_escape_sequence;
 881               ONE_MORE_BYTE (c1);
 882               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 883               DECODE_ISO_CHARACTER (charset, c1);
 884               break;
 885
 886             case 'O':           /* invocation of single-shift-3 */
 887               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 888                 goto label_invalid_escape_sequence;
 889               ONE_MORE_BYTE (c1);
 890               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 891               DECODE_ISO_CHARACTER (charset, c1);
 892               break;
 893
 894             case '0':           /* start composing without embeded rules */
 895               coding->composing = COMPOSING_NO_RULE_HEAD;
 896               break;
 897
 898             case '1':           /* end composing */
 899               coding->composing = COMPOSING_NO;
 900               break;
 901
 902             case '2':           /* start composing with embeded rules */
 903               coding->composing = COMPOSING_WITH_RULE_HEAD;
 904               break;
 905
 906             case '[':           /* specification of direction */
 907               /* For the moment, nested direction is not supported.
 908                  So, the value of `coding->direction' is 0 or 1: 0
 909                  means left-to-right, 1 means right-to-left.  */
 910               ONE_MORE_BYTE (c1);
 911               switch (c1)
 912                 {
 913                 case ']':       /* end of the current direction */
 914                   coding->direction = 0;
 915
 916                 case '0':       /* end of the current direction */
 917                 case '1':       /* start of left-to-right direction */
 918                   ONE_MORE_BYTE (c1);
 919                   if (c1 == ']')
 920                     coding->direction = 0;
 921                   else
 922                     goto label_invalid_escape_sequence;
 923                   break;
 924
 925                 case '2':       /* start of right-to-left direction */
 926                   ONE_MORE_BYTE (c1);
 927                   if (c1 == ']')
 928                     coding->direction= 1;
 929                   else
 930                     goto label_invalid_escape_sequence;
 931                   break;
 932
 933                 default:
 934                   goto label_invalid_escape_sequence;
 935                 }
 936               break;
 937
 938             default:
 939               if (c1 >= 0x28 && c1 <= 0x2B)
 940                 {       /* designation of DIMENSION1_CHARS94 character set */
 941                   ONE_MORE_BYTE (c2);
 942                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 943                 }
 944               else if (c1 >= 0x2C && c1 <= 0x2F)
 945                 {       /* designation of DIMENSION1_CHARS96 character set */
 946                   ONE_MORE_BYTE (c2);
 947                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 948                 }
 949               else
 950                 {
 951                   goto label_invalid_escape_sequence;
 952                 }
 953             }
 954           /* We must update these variables now.  */
 955           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 956           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 957           break;
 958
 959         label_invalid_escape_sequence:
 960           {
 961             int length = src - src_base;
 962
 963             bcopy (src_base, dst, length);
 964             dst += length;
 965           }
 966         }
 967       continue;
 968
 969     label_end_of_loop:
 970       coding->carryover_size = src - src_base;
 971       bcopy (src_base, coding->carryover, coding->carryover_size);
 972       src = src_base;
 973       break;
 974     }
 975
 976   /* If this is the last block of the text to be decoded, we had
 977      better just flush out all remaining codes in the text although
 978      they are not valid characters.  */
 979   if (coding->last_block)
 980     {
 981       bcopy (src, dst, src_end - src);
 982       dst += (src_end - src);
 983       src = src_end;
 984     }
 985   *consumed = src - source;
 986   return dst - destination;
 987 }
 988
 989 /* ISO2022 encoding staffs.  */
 990
 991 /*
 992    It is not enough to say just "ISO2022" on encoding, but we have to
 993    specify more details.  In Emacs, each coding-system of ISO2022
 994    variant has the following specifications:
 995         1. Initial designation to G0 thru G3.
 996         2. Allows short-form designation?
 997         3. ASCII should be designated to G0 before control characters?
 998         4. ASCII should be designated to G0 at end of line?
 999         5. 7-bit environment or 8-bit environment?
1000         6. Use locking-shift?
1001         7. Use Single-shift?
1002    And the following two are only for Japanese:
1003         8. Use ASCII in place of JIS0201-1976-Roman?
1004         9. Use JISX0208-1983 in place of JISX0208-1978?
1005    These specifications are encoded in `coding->flags' as flag bits
1006    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1007    detail.
1008 */
1009
1010 /* Produce codes (escape sequence) for designating CHARSET to graphic
1011    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1012    the coding system CODING allows, produce designation sequence of
1013    short-form.  */
1014
1015 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1016   do {                                                                  \
1017     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1018     char *intermediate_char_94 = "()*+";                                \
1019     char *intermediate_char_96 = ",-./";                                \
1020     Lisp_Object temp                                                    \
1021       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1022     if (! NILP (temp))                                                  \
1023         {                                                               \
1024         *dst++ = ISO_CODE_ESC;                                          \
1025         *dst++ = '&';                                                   \
1026         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1027       }                                                                 \
1028     *dst++ = ISO_CODE_ESC;                                              \
1029     if (CHARSET_DIMENSION (charset) == 1)                               \
1030       {                                                                 \
1031         if (CHARSET_CHARS (charset) == 94)                              \
1032           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1033         else                                                            \
1034           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1035       }                                                                 \
1036     else                                                                \
1037       {                                                                 \
1038         *dst++ = '$';                                                   \
1039         if (CHARSET_CHARS (charset) == 94)                              \
1040           {                                                             \
1041             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1042                 || reg != 0                                             \
1043                 || final_char < '@' || final_char > 'B')                \
1044               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1045           }                                                             \
1046         else                                                            \
1047           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1048       }                                                                 \
1049     *dst++ = final_char;                                                \
1050     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1051   } while (0)
1052
1053 /* The following two macros produce codes (control character or escape
1054    sequence) for ISO2022 single-shift functions (single-shift-2 and
1055    single-shift-3).  */
1056
1057 #define ENCODE_SINGLE_SHIFT_2                           \
1058   do {                                                  \
1059     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1060       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1061     else                                                \
1062       *dst++ = ISO_CODE_SS2;                            \
1063     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1064   } while (0)
1065
1066 #define ENCODE_SINGLE_SHIFT_3                           \
1067   do {                                                  \
1068     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1069       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1070     else                                                \
1071       *dst++ = ISO_CODE_SS3;                            \
1072     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1073   } while (0)
1074
1075 /* The following four macros produce codes (control character or
1076    escape sequence) for ISO2022 locking-shift functions (shift-in,
1077    shift-out, locking-shift-2, and locking-shift-3).  */
1078
1079 #define ENCODE_SHIFT_IN                         \
1080   do {                                          \
1081     *dst++ = ISO_CODE_SI;                       \
1082     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1083   } while (0)
1084
1085 #define ENCODE_SHIFT_OUT                        \
1086   do {                                          \
1087     *dst++ = ISO_CODE_SO;                       \
1088     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1089   } while (0)
1090
1091 #define ENCODE_LOCKING_SHIFT_2                  \
1092   do {                                          \
1093     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1094     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1095   } while (0)
1096
1097 #define ENCODE_LOCKING_SHIFT_3                  \
1098   do {                                          \
1099     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1100     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1101   } while (0)
1102
1103 /* Produce codes for a DIMENSION1 character of which character set is
1104    CHARSET and position-code is C1.  Designation and invocation
1105    sequences are also produced in advance if necessary.  */
1106
1107
1108 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1109   do {                                                                  \
1110     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1111       {                                                                 \
1112         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1113           *dst++ = c1 & 0x7F;                                           \
1114         else                                                            \
1115           *dst++ = c1 | 0x80;                                           \
1116         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1117         break;                                                          \
1118       }                                                                 \
1119     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1120       {                                                                 \
1121         *dst++ = c1 & 0x7F;                                             \
1122         break;                                                          \
1123       }                                                                 \
1124     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1125       {                                                                 \
1126         *dst++ = c1 | 0x80;                                             \
1127         break;                                                          \
1128       }                                                                 \
1129     else                                                                \
1130       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1131          must invoke it, or, at first, designate it to some graphic     \
1132          register.  Then repeat the loop to actually produce the        \
1133          character.  */                                                 \
1134       dst = encode_invocation_designation (charset, coding, dst);       \
1135   } while (1)
1136
1137 /* Produce codes for a DIMENSION2 character of which character set is
1138    CHARSET and position-codes are C1 and C2.  Designation and
1139    invocation codes are also produced in advance if necessary.  */
1140
1141 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1142   do {                                                                  \
1143     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1144       {                                                                 \
1145         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1146           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1147         else                                                            \
1148           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1149         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1150         break;                                                          \
1151       }                                                                 \
1152     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1153       {                                                                 \
1154         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1155         break;                                                          \
1156       }                                                                 \
1157     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1158       {                                                                 \
1159         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1160         break;                                                          \
1161       }                                                                 \
1162     else                                                                \
1163       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1164          must invoke it, or, at first, designate it to some graphic     \
1165          register.  Then repeat the loop to actually produce the        \
1166          character.  */                                                 \
1167       dst = encode_invocation_designation (charset, coding, dst);       \
1168   } while (1)
1169
1170 /* Produce designation and invocation codes at a place pointed by DST
1171    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1172    Return new DST.  */
1173
1174 unsigned char *
1175 encode_invocation_designation (charset, coding, dst)
1176      int charset;
1177      struct coding_system *coding;
1178      unsigned char *dst;
1179 {
1180   int reg;                      /* graphic register number */
1181
1182   /* At first, check designations.  */
1183   for (reg = 0; reg < 4; reg++)
1184     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1185       break;
1186
1187   if (reg >= 4)
1188     {
1189       /* CHARSET is not yet designated to any graphic registers.  */
1190       /* At first check the requested designation.  */
1191       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1192       if (reg < 0)
1193         /* Since CHARSET requests no special designation, designate to
1194            graphic register 0.  */
1195         reg = 0;
1196
1197       ENCODE_DESIGNATION (charset, reg, coding);
1198     }
1199
1200   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1201       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1202     {
1203       /* Since the graphic register REG is not invoked to any graphic
1204          planes, invoke it to graphic plane 0.  */
1205       switch (reg)
1206         {
1207         case 0:                 /* graphic register 0 */
1208           ENCODE_SHIFT_IN;
1209           break;
1210
1211         case 1:                 /* graphic register 1 */
1212           ENCODE_SHIFT_OUT;
1213           break;
1214
1215         case 2:                 /* graphic register 2 */
1216           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1217             ENCODE_SINGLE_SHIFT_2;
1218           else
1219             ENCODE_LOCKING_SHIFT_2;
1220           break;
1221
1222         case 3:                 /* graphic register 3 */
1223           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1224             ENCODE_SINGLE_SHIFT_3;
1225           else
1226             ENCODE_LOCKING_SHIFT_3;
1227           break;
1228         }
1229     }
1230   return dst;
1231 }
1232
1233 /* The following two macros produce codes for indicating composition.  */
1234 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1235 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1236 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1237
1238 /* The following three macros produce codes for indicating direction
1239    of text.  */
1240 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1241   do {                                                  \
1242     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1243       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1244     else                                                \
1245       *dst++ = ISO_CODE_CSI;                            \
1246   } while (0)
1247
1248 #define ENCODE_DIRECTION_R2L    \
1249   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1250
1251 #define ENCODE_DIRECTION_L2R    \
1252   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1253
1254 /* Produce codes for designation and invocation to reset the graphic
1255    planes and registers to initial state.  */
1256 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1257   do {                                                                      \
1258     int reg;                                                                \
1259     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1260       ENCODE_SHIFT_IN;                                                      \
1261     for (reg = 0; reg < 4; reg++)                                           \
1262       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1263           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1264               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1265         ENCODE_DESIGNATION                                                  \
1266           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1267   } while (0)
1268
1269 int
1270 encode_designation_at_bol (coding, src, src_end, dstp)
1271      struct coding_system *coding;
1272      unsigned char *src, *src_end, **dstp;
1273 {
1274   int charset, reg, r[4];
1275   unsigned char *dst = *dstp, c;
1276   for (reg = 0; reg < 4; reg++) r[reg] = -1;
1277   while (src < src_end && (c = *src++) != '\n')
1278     {
1279       switch (emacs_code_class[c])
1280         {
1281         case EMACS_ascii_code:
1282           charset = CHARSET_ASCII;
1283           break;
1284         case EMACS_leading_code_2:
1285           if (++src >= src_end) continue;
1286           charset = c;
1287           break;
1288         case EMACS_leading_code_3:
1289           if ((src += 2) >= src_end) continue;
1290           charset =  (c < LEADING_CODE_PRIVATE_11 ? c : *(src - 2));
1291           break;
1292         case EMACS_leading_code_4:
1293           if ((src += 3) >= src_end) continue;
1294           charset = *(src - 3);
1295           break;
1296         default:
1297           continue;
1298         }
1299       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1300       if (r[reg] < 0
1301           && CODING_SPEC_ISO_DESIGNATION (coding, reg) != charset)
1302         r[reg] = charset;
1303     }
1304   if (c != '\n' && !coding->last_block)
1305     return -1;
1306   for (reg = 0; reg < 4; reg++)
1307     if (r[reg] >= 0)
1308       ENCODE_DESIGNATION (r[reg], reg, coding);
1309   *dstp = dst;
1310   return 0;
1311 }
1312
1313 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1314
1315 int
1316 encode_coding_iso2022 (coding, source, destination,
1317                        src_bytes, dst_bytes, consumed)
1318      struct coding_system *coding;
1319      unsigned char *source, *destination;
1320      int src_bytes, dst_bytes;
1321      int *consumed;
1322 {
1323   unsigned char *src = source;
1324   unsigned char *src_end = source + src_bytes;
1325   unsigned char *dst = destination;
1326   unsigned char *dst_end = destination + dst_bytes;
1327   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1328      from DST_END to assure overflow checking is necessary only at the
1329      head of loop.  */
1330   unsigned char *adjusted_dst_end = dst_end - 19;
1331
1332   while (src < src_end && dst < adjusted_dst_end)
1333     {
1334       /* SRC_BASE remembers the start position in source in each loop.
1335          The loop will be exited when there's not enough source text
1336          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1337          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1338          reset to SRC_BASE before exiting.  */
1339       unsigned char *src_base = src;
1340       unsigned char c1, c2, c3, c4;
1341       int charset;
1342
1343       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1344           && CODING_SPEC_ISO_BOL (coding))
1345         {
1346           /* We have to produce destination sequences now.  */
1347           if (encode_designation_at_bol (coding, src, src_end, &dst) < 0)
1348             /* We can't find end of line in the current block.  Let's
1349              repeat encoding starting from the current position
1350              pointed by SRC.  */
1351             break;
1352           CODING_SPEC_ISO_BOL (coding) = 0;
1353         }
1354
1355       c1 = *src++;
1356       /* If we are seeing a component of a composite character, we are
1357          seeing a leading-code specially encoded for composition, or a
1358          composition rule if composing with rule.  We must set C1
1359          to a normal leading-code or an ASCII code.  If we are not at
1360          a composed character, we must reset the composition state.  */
1361       if (COMPOSING_P (coding->composing))
1362         {
1363           if (c1 < 0xA0)
1364             {
1365               /* We are not in a composite character any longer.  */
1366               coding->composing = COMPOSING_NO;
1367               ENCODE_COMPOSITION_END;
1368             }
1369           else
1370             {
1371               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1372                 {
1373                   *dst++ = c1 & 0x7F;
1374                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1375                   continue;
1376                 }
1377               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1378                 coding->composing = COMPOSING_WITH_RULE_RULE;
1379               if (c1 == 0xA0)
1380                 {
1381                   /* This is an ASCII component.  */
1382                   ONE_MORE_BYTE (c1);
1383                   c1 &= 0x7F;
1384                 }
1385               else
1386                 /* This is a leading-code of non ASCII component.  */
1387                 c1 -= 0x20;
1388             }
1389         }
1390
1391       /* Now encode one character.  C1 is a control character, an
1392          ASCII character, or a leading-code of multi-byte character.  */
1393       switch (emacs_code_class[c1])
1394         {
1395         case EMACS_ascii_code:
1396           ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
1397           break;
1398
1399         case EMACS_control_code:
1400           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1401             ENCODE_RESET_PLANE_AND_REGISTER;
1402           *dst++ = c1;
1403           break;
1404
1405         case EMACS_carriage_return_code:
1406           if (!coding->selective)
1407             {
1408               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1409                 ENCODE_RESET_PLANE_AND_REGISTER;
1410               *dst++ = c1;
1411               break;
1412             }
1413           /* fall down to treat '\r' as '\n' ...  */
1414
1415         case EMACS_linefeed_code:
1416           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1417             ENCODE_RESET_PLANE_AND_REGISTER;
1418           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1419             bcopy (coding->spec.iso2022.initial_designation,
1420                    coding->spec.iso2022.current_designation,
1421                    sizeof coding->spec.iso2022.initial_designation);
1422           if (coding->eol_type == CODING_EOL_LF
1423               || coding->eol_type == CODING_EOL_AUTOMATIC)
1424             *dst++ = ISO_CODE_LF;
1425           else if (coding->eol_type == CODING_EOL_CRLF)
1426             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1427           else
1428             *dst++ = ISO_CODE_CR;
1429           CODING_SPEC_ISO_BOL (coding) = 1;
1430           break;
1431
1432         case EMACS_leading_code_2:
1433           ONE_MORE_BYTE (c2);
1434           ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
1435           break;
1436
1437         case EMACS_leading_code_3:
1438           TWO_MORE_BYTES (c2, c3);
1439           if (c1 < LEADING_CODE_PRIVATE_11)
1440             ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
1441           else
1442             ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
1443           break;
1444
1445         case EMACS_leading_code_4:
1446           THREE_MORE_BYTES (c2, c3, c4);
1447           ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
1448           break;
1449
1450         case EMACS_leading_code_composition:
1451           ONE_MORE_BYTE (c1);
1452           if (c1 == 0xFF)
1453             {
1454               coding->composing = COMPOSING_WITH_RULE_HEAD;
1455               ENCODE_COMPOSITION_WITH_RULE_START;
1456             }
1457           else
1458             {
1459               /* Rewind one byte because it is a character code of
1460                  composition elements.  */
1461               src--;
1462               coding->composing = COMPOSING_NO_RULE_HEAD;
1463               ENCODE_COMPOSITION_NO_RULE_START;
1464             }
1465           break;
1466
1467         case EMACS_invalid_code:
1468           *dst++ = c1;
1469           break;
1470         }
1471       continue;
1472     label_end_of_loop:
1473       coding->carryover_size = src - src_base;
1474       bcopy (src_base, coding->carryover, coding->carryover_size);
1475       src = src_base;
1476       break;
1477     }
1478
1479   /* If this is the last block of the text to be encoded, we must
1480      reset the state of graphic planes and registers to initial one.
1481      In addition, we had better just flush out all remaining codes in
1482      the text although they are not valid characters.  */
1483   if (coding->last_block)
1484     {
1485       ENCODE_RESET_PLANE_AND_REGISTER;
1486       bcopy(src, dst, src_end - src);
1487       dst += (src_end - src);
1488       src = src_end;
1489     }
1490   *consumed = src - source;
1491   return dst - destination;
1492 }
1493
1494 \f
1495 /*** 4. SJIS and BIG5 handlers ***/
1496
1497 /* Although SJIS and BIG5 are not ISO's coding system, They are used
1498    quite widely.  So, for the moment, Emacs supports them in the bare
1499    C code.  But, in the future, they may be supported only by CCL.  */
1500
1501 /* SJIS is a coding system encoding three character sets: ASCII, right
1502    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1503    as is.  A character of charset katakana-jisx0201 is encoded by
1504    "position-code + 0x80".  A character of charset japanese-jisx0208
1505    is encoded in 2-byte but two position-codes are divided and shifted
1506    so that it fit in the range below.
1507
1508    --- CODE RANGE of SJIS ---
1509    (character set)      (range)
1510    ASCII                0x00 .. 0x7F
1511    KATAKANA-JISX0201    0xA0 .. 0xDF
1512    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1513             (2nd byte)  0x40 .. 0xFF
1514    -------------------------------
1515
1516 */
1517
1518 /* BIG5 is a coding system encoding two character sets: ASCII and
1519    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1520    character set and is encoded in two-byte.
1521
1522    --- CODE RANGE of BIG5 ---
1523    (character set)      (range)
1524    ASCII                0x00 .. 0x7F
1525    Big5 (1st byte)      0xA1 .. 0xFE
1526         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1527    --------------------------
1528
1529    Since the number of characters in Big5 is larger than maximum
1530    characters in Emacs' charset (96x96), it can't be handled as one
1531    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1532    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1533    contains frequently used characters and the latter contains less
1534    frequently used characters.  */
1535
1536 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1537    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1538    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1539    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1540
1541 /* Number of Big5 characters which have the same code in 1st byte.  */
1542 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1543
1544 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1545   do {                                                                  \
1546     unsigned int temp                                                   \
1547       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1548     if (b1 < 0xC9)                                                      \
1549       charset = charset_big5_1;                                         \
1550     else                                                                \
1551       {                                                                 \
1552         charset = charset_big5_2;                                       \
1553         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1554       }                                                                 \
1555     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1556     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1557   } while (0)
1558
1559 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1560   do {                                                                  \
1561     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1562     if (charset == charset_big5_2)                                      \
1563       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1564     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1565     b2 = temp % BIG5_SAME_ROW;                                          \
1566     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1567   } while (0)
1568
1569 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1570    Check if a text is encoded in SJIS.  If it is, return
1571    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1572
1573 int
1574 detect_coding_sjis (src, src_end)
1575      unsigned char *src, *src_end;
1576 {
1577   unsigned char c;
1578
1579   while (src < src_end)
1580     {
1581       c = *src++;
1582       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1583         return 0;
1584       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1585         {
1586           if (src < src_end && *src++ < 0x40)
1587             return 0;
1588         }
1589     }
1590   return CODING_CATEGORY_MASK_SJIS;
1591 }
1592
1593 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1594    Check if a text is encoded in BIG5.  If it is, return
1595    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1596
1597 int
1598 detect_coding_big5 (src, src_end)
1599      unsigned char *src, *src_end;
1600 {
1601   unsigned char c;
1602
1603   while (src < src_end)
1604     {
1605       c = *src++;
1606       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1607         return 0;
1608       if (c >= 0xA1)
1609         {
1610           if (src >= src_end)
1611             break;
1612           c = *src++;
1613           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1614             return 0;
1615         }
1616     }
1617   return CODING_CATEGORY_MASK_BIG5;
1618 }
1619
1620 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1621    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1622
1623 int
1624 decode_coding_sjis_big5 (coding, source, destination,
1625                          src_bytes, dst_bytes, consumed, sjis_p)
1626      struct coding_system *coding;
1627      unsigned char *source, *destination;
1628      int src_bytes, dst_bytes;
1629      int *consumed;
1630      int sjis_p;
1631 {
1632   unsigned char *src = source;
1633   unsigned char *src_end = source + src_bytes;
1634   unsigned char *dst = destination;
1635   unsigned char *dst_end = destination + dst_bytes;
1636   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1637      from DST_END to assure overflow checking is necessary only at the
1638      head of loop.  */
1639   unsigned char *adjusted_dst_end = dst_end - 3;
1640
1641   while (src < src_end && dst < adjusted_dst_end)
1642     {
1643       /* SRC_BASE remembers the start position in source in each loop.
1644          The loop will be exited when there's not enough source text
1645          to analyze two-byte character (within macro ONE_MORE_BYTE).
1646          In that case, SRC is reset to SRC_BASE before exiting.  */
1647       unsigned char *src_base = src;
1648       unsigned char c1 = *src++, c2, c3, c4;
1649
1650       if (c1 == '\r')
1651         {
1652           if (coding->eol_type == CODING_EOL_CRLF)
1653             {
1654               ONE_MORE_BYTE (c2);
1655               if (c2 == '\n')
1656                 *dst++ = c2;
1657               else
1658                 /* To process C2 again, SRC is subtracted by 1.  */
1659                 *dst++ = c1, src--;
1660             }
1661           else
1662             *dst++ = c1;
1663         }
1664       else if (c1 < 0x80)
1665         *dst++ = c1;
1666       else if (c1 < 0xA0 || c1 >= 0xE0)
1667         {
1668           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1669           if (sjis_p)
1670             {
1671               ONE_MORE_BYTE (c2);
1672               DECODE_SJIS (c1, c2, c3, c4);
1673               DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1674             }
1675           else if (c1 >= 0xE0 && c1 < 0xFF)
1676             {
1677               int charset;
1678
1679               ONE_MORE_BYTE (c2);
1680               DECODE_BIG5 (c1, c2, charset, c3, c4);
1681               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1682             }
1683           else                  /* Invalid code */
1684             *dst++ = c1;
1685         }
1686       else
1687         {
1688           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1689           if (sjis_p)
1690             DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1691           else
1692             {
1693               int charset;
1694
1695               ONE_MORE_BYTE (c2);
1696               DECODE_BIG5 (c1, c2, charset, c3, c4);
1697               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1698             }
1699         }
1700       continue;
1701
1702     label_end_of_loop:
1703       coding->carryover_size = src - src_base;
1704       bcopy (src_base, coding->carryover, coding->carryover_size);
1705       src = src_base;
1706       break;
1707     }
1708
1709   *consumed = src - source;
1710   return dst - destination;
1711 }
1712
1713 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1714    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1715    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1716    sure that all these charsets are registered as official charset
1717    (i.e. do not have extended leading-codes).  Characters of other
1718    charsets are produced without any encoding.  If SJIS_P is 1, encode
1719    SJIS text, else encode BIG5 text.  */
1720
1721 int
1722 encode_coding_sjis_big5 (coding, source, destination,
1723                          src_bytes, dst_bytes, consumed, sjis_p)
1724      struct coding_system *coding;
1725      unsigned char *source, *destination;
1726      int src_bytes, dst_bytes;
1727      int *consumed;
1728      int sjis_p;
1729 {
1730   unsigned char *src = source;
1731   unsigned char *src_end = source + src_bytes;
1732   unsigned char *dst = destination;
1733   unsigned char *dst_end = destination + dst_bytes;
1734   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1735      from DST_END to assure overflow checking is necessary only at the
1736      head of loop.  */
1737   unsigned char *adjusted_dst_end = dst_end - 1;
1738
1739   while (src < src_end && dst < adjusted_dst_end)
1740     {
1741       /* SRC_BASE remembers the start position in source in each loop.
1742          The loop will be exited when there's not enough source text
1743          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1744          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1745          before exiting.  */
1746       unsigned char *src_base = src;
1747       unsigned char c1 = *src++, c2, c3, c4;
1748
1749       if (coding->composing)
1750         {
1751           if (c1 == 0xA0)
1752             {
1753               ONE_MORE_BYTE (c1);
1754               c1 &= 0x7F;
1755             }
1756           else if (c1 >= 0xA0)
1757             c1 -= 0x20;
1758           else
1759             coding->composing = 0;
1760         }
1761
1762       switch (emacs_code_class[c1])
1763         {
1764         case EMACS_ascii_code:
1765         case EMACS_control_code:
1766           *dst++ = c1;
1767           break;
1768
1769         case EMACS_carriage_return_code:
1770           if (!coding->selective)
1771             {
1772               *dst++ = c1;
1773               break;
1774             }
1775           /* fall down to treat '\r' as '\n' ...  */
1776
1777         case EMACS_linefeed_code:
1778           if (coding->eol_type == CODING_EOL_LF
1779               || coding->eol_type == CODING_EOL_AUTOMATIC)
1780             *dst++ = '\n';
1781           else if (coding->eol_type == CODING_EOL_CRLF)
1782             *dst++ = '\r', *dst++ = '\n';
1783           else
1784             *dst++ = '\r';
1785           break;
1786
1787         case EMACS_leading_code_2:
1788           ONE_MORE_BYTE (c2);
1789           if (sjis_p && c1 == charset_katakana_jisx0201)
1790             *dst++ = c2;
1791           else
1792             *dst++ = c1, *dst++ = c2;
1793           break;
1794
1795         case EMACS_leading_code_3:
1796           TWO_MORE_BYTES (c2, c3);
1797           c2 &= 0x7F, c3 &= 0x7F;
1798           if (sjis_p && c1 == charset_jisx0208)
1799             {
1800               unsigned char s1, s2;
1801
1802               ENCODE_SJIS (c2, c3, s1, s2);
1803               *dst++ = s1, *dst++ = s2;
1804             }
1805           else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1806             {
1807               unsigned char b1, b2;
1808
1809               ENCODE_BIG5 (c1, c2, c3, b1, b2);
1810               *dst++ = b1, *dst++ = b2;
1811             }
1812           else
1813             *dst++ = c1, *dst++ = c2, *dst++ = c3;
1814           break;
1815
1816         case EMACS_leading_code_4:
1817           THREE_MORE_BYTES (c2, c3, c4);
1818           *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1819           break;
1820
1821         case EMACS_leading_code_composition:
1822           coding->composing = 1;
1823           break;
1824
1825         default:                /* i.e. case EMACS_invalid_code: */
1826           *dst++ = c1;
1827         }
1828       continue;
1829
1830     label_end_of_loop:
1831       coding->carryover_size = src - src_base;
1832       bcopy (src_base, coding->carryover, coding->carryover_size);
1833       src = src_base;
1834       break;
1835     }
1836
1837   *consumed = src - source;
1838   return dst - destination;
1839 }
1840
1841 \f
1842 /*** 5. End-of-line handlers ***/
1843
1844 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1845    This function is called only when `coding->eol_type' is
1846    CODING_EOL_CRLF or CODING_EOL_CR.  */
1847
1848 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1849      struct coding_system *coding;
1850      unsigned char *source, *destination;
1851      int src_bytes, dst_bytes;
1852      int *consumed;
1853 {
1854   unsigned char *src = source;
1855   unsigned char *src_end = source + src_bytes;
1856   unsigned char *dst = destination;
1857   unsigned char *dst_end = destination + dst_bytes;
1858   int produced;
1859
1860   switch (coding->eol_type)
1861     {
1862     case CODING_EOL_CRLF:
1863       {
1864         /* Since the maximum bytes produced by each loop is 2, we
1865            subtract 1 from DST_END to assure overflow checking is
1866            necessary only at the head of loop.  */
1867         unsigned char *adjusted_dst_end = dst_end - 1;
1868
1869         while (src < src_end && dst < adjusted_dst_end)
1870           {
1871             unsigned char *src_base = src;
1872             unsigned char c = *src++;
1873             if (c == '\r')
1874               {
1875                 ONE_MORE_BYTE (c);
1876                 if (c != '\n')
1877                   *dst++ = '\r';
1878                 *dst++ = c;
1879               }
1880             else
1881               *dst++ = c;
1882             continue;
1883
1884           label_end_of_loop:
1885             coding->carryover_size = src - src_base;
1886             bcopy (src_base, coding->carryover, coding->carryover_size);
1887             src = src_base;
1888             break;
1889           }
1890         *consumed = src - source;
1891         produced = dst - destination;
1892         break;
1893       }
1894
1895     case CODING_EOL_CR:
1896       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1897       bcopy (source, destination, produced);
1898       dst_end = destination + produced;
1899       while (dst < dst_end)
1900         if (*dst++ == '\r') dst[-1] = '\n';
1901       *consumed = produced;
1902       break;
1903
1904     default:                    /* i.e. case: CODING_EOL_LF */
1905       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1906       bcopy (source, destination, produced);
1907       *consumed = produced;
1908       break;
1909     }
1910
1911   return produced;
1912 }
1913
1914 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
1915    format of end-of-line according to `coding->eol_type'.  If
1916    `coding->selective' is 1, code '\r' in source text also means
1917    end-of-line.  */
1918
1919 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1920      struct coding_system *coding;
1921      unsigned char *source, *destination;
1922      int src_bytes, dst_bytes;
1923      int *consumed;
1924 {
1925   unsigned char *src = source;
1926   unsigned char *dst = destination;
1927   int produced;
1928
1929   if (src_bytes <= 0)
1930     return 0;
1931
1932   switch (coding->eol_type)
1933     {
1934     case CODING_EOL_LF:
1935     case CODING_EOL_AUTOMATIC:
1936       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1937       bcopy (source, destination, produced);
1938       if (coding->selective)
1939         {
1940           int i = produced;
1941           while (i--)
1942             if (*dst++ == '\r') dst[-1] = '\n';
1943         }
1944       *consumed = produced;
1945
1946     case CODING_EOL_CRLF:
1947       {
1948         unsigned char c;
1949         unsigned char *src_end = source + src_bytes;
1950         unsigned char *dst_end = destination + dst_bytes;
1951         /* Since the maximum bytes produced by each loop is 2, we
1952            subtract 1 from DST_END to assure overflow checking is
1953            necessary only at the head of loop.  */
1954         unsigned char *adjusted_dst_end = dst_end - 1;
1955
1956         while (src < src_end && dst < adjusted_dst_end)
1957           {
1958             c = *src++;
1959             if (c == '\n' || (c == '\r' && coding->selective))
1960               *dst++ = '\r', *dst++ = '\n';
1961             else
1962               *dst++ = c;
1963           }
1964         produced = dst - destination;
1965         *consumed = src - source;
1966         break;
1967       }
1968
1969     default:                    /* i.e. case CODING_EOL_CR: */
1970       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1971       bcopy (source, destination, produced);
1972       {
1973         int i = produced;
1974         while (i--)
1975           if (*dst++ == '\n') dst[-1] = '\r';
1976       }
1977       *consumed = produced;
1978     }
1979
1980   return produced;
1981 }
1982
1983 \f
1984 /*** 6. C library functions ***/
1985
1986 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
1987    has a property `coding-system'.  The value of this property is a
1988    vector of length 5 (called as coding-vector).  Among elements of
1989    this vector, the first (element[0]) and the fifth (element[4])
1990    carry important information for decoding/encoding.  Before
1991    decoding/encoding, this information should be set in fields of a
1992    structure of type `coding_system'.
1993
1994    A value of property `coding-system' can be a symbol of another
1995    subsidiary coding-system.  In that case, Emacs gets coding-vector
1996    from that symbol.
1997
1998    `element[0]' contains information to be set in `coding->type'.  The
1999    value and its meaning is as follows:
2000
2001    0 -- coding_system_internal
2002    1 -- coding_system_sjis
2003    2 -- coding_system_iso2022
2004    3 -- coding_system_big5
2005    4 -- coding_system_ccl
2006    nil -- coding_system_no_conversion
2007    t -- coding_system_automatic
2008
2009    `element[4]' contains information to be set in `coding->flags' and
2010    `coding->spec'.  The meaning varies by `coding->type'.
2011
2012    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2013    of length 32 (of which the first 13 sub-elements are used now).
2014    Meanings of these sub-elements are:
2015
2016    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2017         If the value is an integer of valid charset, the charset is
2018         assumed to be designated to graphic register N initially.
2019
2020         If the value is minus, it is a minus value of charset which
2021         reserves graphic register N, which means that the charset is
2022         not designated initially but should be designated to graphic
2023         register N just before encoding a character in that charset.
2024
2025         If the value is nil, graphic register N is never used on
2026         encoding.
2027
2028    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2029         Each value takes t or nil.  See the section ISO2022 of
2030         `coding.h' for more information.
2031
2032    If `coding->type' is `coding_type_big5', element[4] is t to denote
2033    BIG5-ETen or nil to denote BIG5-HKU.
2034
2035    If `coding->type' takes the other value, element[4] is ignored.
2036
2037    Emacs Lisp's coding system also carries information about format of
2038    end-of-line in a value of property `eol-type'.  If the value is
2039    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2040    means CODING_EOL_CR.  If it is not integer, it should be a vector
2041    of subsidiary coding systems of which property `eol-type' has one
2042    of above values.
2043
2044 */
2045
2046 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2047    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2048    is setup so that no conversion is necessary and return -1, else
2049    return 0.  */
2050
2051 int
2052 setup_coding_system (coding_system, coding)
2053      Lisp_Object coding_system;
2054      struct coding_system *coding;
2055 {
2056   Lisp_Object type, eol_type;
2057
2058   /* At first, set several fields default values.  */
2059   coding->require_flushing = 0;
2060   coding->last_block = 0;
2061   coding->selective = 0;
2062   coding->composing = 0;
2063   coding->direction = 0;
2064   coding->carryover_size = 0;
2065   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2066
2067   Vlast_coding_system_used = coding->symbol = coding_system;
2068   eol_type = Qnil;
2069   /* Get value of property `coding-system' until we get a vector.
2070      While doing that, also get values of properties
2071      `post-read-conversion', `pre-write-conversion', and `eol-type'.  */
2072   while (!NILP (coding_system) && SYMBOLP (coding_system))
2073     {
2074       if (NILP (coding->post_read_conversion))
2075         coding->post_read_conversion = Fget (coding_system,
2076                                              Qpost_read_conversion);
2077       if (NILP (coding->pre_write_conversion))
2078         coding->pre_write_conversion = Fget (coding_system,
2079                                              Qpre_write_conversion);
2080       if (NILP (eol_type))
2081         eol_type = Fget (coding_system, Qeol_type);
2082       coding_system = Fget (coding_system, Qcoding_system);
2083     }
2084   if (!VECTORP (coding_system)
2085       || XVECTOR (coding_system)->size != 5)
2086     goto label_invalid_coding_system;
2087
2088   if (VECTORP (eol_type))
2089     coding->eol_type = CODING_EOL_AUTOMATIC;
2090   else if (XFASTINT (eol_type) == 1)
2091     coding->eol_type = CODING_EOL_CRLF;
2092   else if (XFASTINT (eol_type) == 2)
2093     coding->eol_type = CODING_EOL_CR;
2094   else
2095     coding->eol_type = CODING_EOL_LF;
2096
2097   type = XVECTOR (coding_system)->contents[0];
2098   switch (XFASTINT (type))
2099     {
2100     case 0:
2101       coding->type = coding_type_internal;
2102       break;
2103
2104     case 1:
2105       coding->type = coding_type_sjis;
2106       break;
2107
2108     case 2:
2109       coding->type = coding_type_iso2022;
2110       {
2111         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2112         Lisp_Object *flags;
2113         int i, charset, default_reg_bits = 0;
2114
2115         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2116           goto label_invalid_coding_system;
2117
2118         flags = XVECTOR (val)->contents;
2119         coding->flags
2120           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2121              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2122              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2123              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2124              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2125              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2126              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2127              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2128              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2129              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2130              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2131
2132         /* Invoke graphic register 0 to plane 0.  */
2133         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2134         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2135         CODING_SPEC_ISO_INVOCATION (coding, 1)
2136           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2137         /* Not single shifting at first.  */
2138         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2139         /* Beginning of buffer should also be regarded as bol. */
2140         CODING_SPEC_ISO_BOL(coding) = 1;
2141
2142         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2143            FLAGS[REG] can be one of below:
2144                 integer CHARSET: CHARSET occupies register I,
2145                 t: designate nothing to REG initially, but can be used
2146                   by any charsets,
2147                 list of integer, nil, or t: designate the first
2148                   element (if integer) to REG initially, the remaining
2149                   elements (if integer) is designated to REG on request,
2150                   if an element is t, REG can be used by any charset,
2151                 nil: REG is never used.  */
2152         for (charset = 0; charset <= MAX_CHARSET; charset++)
2153           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2154         for (i = 0; i < 4; i++)
2155           {
2156             if (INTEGERP (flags[i])
2157                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2158                 || (charset = get_charset_id (flags[i])) >= 0)
2159               {
2160                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2161                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2162               }
2163             else if (EQ (flags[i], Qt))
2164               {
2165                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2166                 default_reg_bits |= 1 << i;
2167               }
2168             else if (CONSP (flags[i]))
2169               {
2170                 Lisp_Object tail = flags[i];
2171
2172                 if (INTEGERP (XCONS (tail)->car)
2173                     && (charset = XINT (XCONS (tail)->car),
2174                         CHARSET_VALID_P (charset))
2175                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2176                   {
2177                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2178                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2179                   }
2180                 else
2181                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2182                 tail = XCONS (tail)->cdr;
2183                 while (CONSP (tail))
2184                   {
2185                     if (INTEGERP (XCONS (tail)->car)
2186                         && (charset = XINT (XCONS (tail)->car),
2187                             CHARSET_VALID_P (charset))
2188                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2189                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2190                         = i;
2191                     else if (EQ (XCONS (tail)->car, Qt))
2192                       default_reg_bits |= 1 << i;
2193                     tail = XCONS (tail)->cdr;
2194                   }
2195               }
2196             else
2197               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2198
2199             CODING_SPEC_ISO_DESIGNATION (coding, i)
2200               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2201           }
2202
2203         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2204           {
2205             /* REG 1 can be used only by locking shift in 7-bit env.  */
2206             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2207               default_reg_bits &= ~2;
2208             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2209               /* Without any shifting, only REG 0 and 1 can be used.  */
2210               default_reg_bits &= 3;
2211           }
2212
2213         for (charset = 0; charset <= MAX_CHARSET; charset++)
2214           if (CHARSET_VALID_P (charset)
2215               && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2216             {
2217               /* We have not yet decided where to designate CHARSET.  */
2218               int reg_bits = default_reg_bits;
2219
2220               if (CHARSET_CHARS (charset) == 96)
2221                 /* A charset of CHARS96 can't be designated to REG 0.  */
2222                 reg_bits &= ~1;
2223
2224               if (reg_bits)
2225                 /* There exist some default graphic register.  */
2226                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2227                   = (reg_bits & 1
2228                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2229               else
2230                 /* We anyway have to designate CHARSET to somewhere.  */
2231                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2232                   = (CHARSET_CHARS (charset) == 94
2233                      ? 0
2234                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2235                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2236                         ? 1
2237                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2238                            ? 2 : 0)));
2239             }
2240       }
2241       coding->require_flushing = 1;
2242       break;
2243
2244     case 3:
2245       coding->type = coding_type_big5;
2246       coding->flags
2247         = (NILP (XVECTOR (coding_system)->contents[4])
2248            ? CODING_FLAG_BIG5_HKU
2249            : CODING_FLAG_BIG5_ETEN);
2250       break;
2251
2252     case 4:
2253       coding->type = coding_type_ccl;
2254       {
2255         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2256         if (CONSP  (val)
2257             && VECTORP (XCONS (val)->car)
2258             && VECTORP (XCONS (val)->cdr))
2259           {
2260             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2261             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2262           }
2263         else
2264           goto label_invalid_coding_system;
2265       }
2266       coding->require_flushing = 1;
2267       break;
2268
2269     default:
2270       if (EQ (type, Qt))
2271         coding->type = coding_type_automatic;
2272       else
2273         coding->type = coding_type_no_conversion;
2274       break;
2275     }
2276   return 0;
2277
2278  label_invalid_coding_system:
2279   coding->type = coding_type_no_conversion;
2280   coding->eol_type = CODING_EOL_LF;
2281   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2282     = Qnil;
2283   return -1;
2284 }
2285
2286 /* Emacs has a mechanism to automatically detect a coding system if it
2287    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2288    it's impossible to distinguish some coding systems accurately
2289    because they use the same range of codes.  So, at first, coding
2290    systems are categorized into 7, those are:
2291
2292    o coding-category-internal
2293
2294         The category for a coding system which has the same code range
2295         as Emacs' internal format.  Assigned the coding-system (Lisp
2296         symbol) `internal' by default.
2297
2298    o coding-category-sjis
2299
2300         The category for a coding system which has the same code range
2301         as SJIS.  Assigned the coding-system (Lisp
2302         symbol) `shift-jis' by default.
2303
2304    o coding-category-iso-7
2305
2306         The category for a coding system which has the same code range
2307         as ISO2022 of 7-bit environment.  Assigned the coding-system
2308         (Lisp symbol) `iso-2022-7' by default.
2309
2310    o coding-category-iso-8-1
2311
2312         The category for a coding system which has the same code range
2313         as ISO2022 of 8-bit environment and graphic plane 1 used only
2314         for DIMENSION1 charset.  Assigned the coding-system (Lisp
2315         symbol) `iso-8859-1' by default.
2316
2317    o coding-category-iso-8-2
2318
2319         The category for a coding system which has the same code range
2320         as ISO2022 of 8-bit environment and graphic plane 1 used only
2321         for DIMENSION2 charset.  Assigned the coding-system (Lisp
2322         symbol) `euc-japan' by default.
2323
2324    o coding-category-iso-else
2325
2326         The category for a coding system which has the same code range
2327         as ISO2022 but not belongs to any of the above three
2328         categories.  Assigned the coding-system (Lisp symbol)
2329         `iso-2022-ss2-7' by default.
2330
2331    o coding-category-big5
2332
2333         The category for a coding system which has the same code range
2334         as BIG5.  Assigned the coding-system (Lisp symbol)
2335         `cn-big5' by default.
2336
2337    o coding-category-binary
2338
2339         The category for a coding system not categorized in any of the
2340         above.  Assigned the coding-system (Lisp symbol)
2341         `no-conversion' by default.
2342
2343    Each of them is a Lisp symbol and the value is an actual
2344    `coding-system's (this is also a Lisp symbol) assigned by a user.
2345    What Emacs does actually is to detect a category of coding system.
2346    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2347    decide only one possible category, it selects a category of the
2348    highest priority.  Priorities of categories are also specified by a
2349    user in a Lisp variable `coding-category-list'.
2350
2351 */
2352
2353 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2354    If it detects possible coding systems, return an integer in which
2355    appropriate flag bits are set.  Flag bits are defined by macros
2356    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2357
2358 int
2359 detect_coding_mask (src, src_bytes)
2360      unsigned char *src;
2361      int src_bytes;
2362 {
2363   register unsigned char c;
2364   unsigned char *src_end = src + src_bytes;
2365   int mask;
2366
2367   /* At first, skip all ASCII characters and control characters except
2368      for three ISO2022 specific control characters.  */
2369  label_loop_detect_coding:
2370   while (src < src_end)
2371     {
2372       c = *src;
2373       if (c >= 0x80
2374           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2375         break;
2376       src++;
2377     }
2378
2379   if (src >= src_end)
2380     /* We found nothing other than ASCII.  There's nothing to do.  */
2381     return CODING_CATEGORY_MASK_ANY;
2382
2383   /* The text seems to be encoded in some multilingual coding system.
2384      Now, try to find in which coding system the text is encoded.  */
2385   if (c < 0x80)
2386     {
2387       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2388       /* C is an ISO2022 specific control code of C0.  */
2389       mask = detect_coding_iso2022 (src, src_end);
2390       src++;
2391       if (mask == CODING_CATEGORY_MASK_ANY)
2392         /* No valid ISO2022 code follows C.  Try again.  */
2393         goto label_loop_detect_coding;
2394     }
2395   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2396     /* C is an ISO2022 specific control code of C1,
2397        or the first byte of SJIS's 2-byte character code,
2398        or a leading code of Emacs.  */
2399     mask = (detect_coding_iso2022 (src, src_end)
2400             | detect_coding_sjis (src, src_end)
2401             | detect_coding_internal (src, src_end));
2402
2403   else if (c < 0xA0)
2404     /* C is the first byte of SJIS character code,
2405        or a leading-code of Emacs.  */
2406     mask = (detect_coding_sjis (src, src_end)
2407             | detect_coding_internal (src, src_end));
2408
2409   else
2410     /* C is a character of ISO2022 in graphic plane right,
2411        or a SJIS's 1-byte character code (i.e. JISX0201),
2412        or the first byte of BIG5's 2-byte code.  */
2413     mask = (detect_coding_iso2022 (src, src_end)
2414             | detect_coding_sjis (src, src_end)
2415             | detect_coding_big5 (src, src_end));
2416
2417   return mask;
2418 }
2419
2420 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2421    The information of the detected coding system is set in CODING.  */
2422
2423 void
2424 detect_coding (coding, src, src_bytes)
2425      struct coding_system *coding;
2426      unsigned char *src;
2427      int src_bytes;
2428 {
2429   int mask = detect_coding_mask (src, src_bytes);
2430   int idx;
2431
2432   if (mask == CODING_CATEGORY_MASK_ANY)
2433     /* We found nothing other than ASCII.  There's nothing to do.  */
2434     return;
2435
2436   if (!mask)
2437     /* The source text seems to be encoded in unknown coding system.
2438        Emacs regards the category of such a kind of coding system as
2439        `coding-category-binary'.  We assume that a user has assigned
2440        an appropriate coding system for a `coding-category-binary'.  */
2441     idx = CODING_CATEGORY_IDX_BINARY;
2442   else
2443     {
2444       /* We found some plausible coding systems.  Let's use a coding
2445          system of the highest priority.  */
2446       Lisp_Object val = Vcoding_category_list;
2447
2448       if (CONSP (val))
2449         while (!NILP (val))
2450           {
2451             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2452             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2453               break;
2454             val = XCONS (val)->cdr;
2455           }
2456       else
2457         val = Qnil;
2458
2459       if (NILP (val))
2460         {
2461           /* For unknown reason, `Vcoding_category_list' contains none
2462              of found categories.  Let's use any of them.  */
2463           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2464             if (mask & (1 << idx))
2465               break;
2466         }
2467     }
2468   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2469 }
2470
2471 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2472    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2473    CODING_EOL_CR, and CODING_EOL_AUTOMATIC.  */
2474
2475 int
2476 detect_eol_type (src, src_bytes)
2477      unsigned char *src;
2478      int src_bytes;
2479 {
2480   unsigned char *src_end = src + src_bytes;
2481   unsigned char c;
2482
2483   while (src < src_end)
2484     {
2485       c = *src++;
2486       if (c == '\n')
2487         return CODING_EOL_LF;
2488       else if (c == '\r')
2489         {
2490           if (src < src_end && *src == '\n')
2491             return CODING_EOL_CRLF;
2492           else
2493             return CODING_EOL_CR;
2494         }
2495     }
2496   return CODING_EOL_AUTOMATIC;
2497 }
2498
2499 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2500    is encoded.  If it detects an appropriate format of end-of-line, it
2501    sets the information in *CODING.  */
2502
2503 void
2504 detect_eol (coding, src, src_bytes)
2505      struct coding_system *coding;
2506      unsigned char *src;
2507      int src_bytes;
2508 {
2509   Lisp_Object val;
2510   int eol_type = detect_eol_type (src, src_bytes);
2511
2512   if (eol_type == CODING_EOL_AUTOMATIC)
2513     /*  We found no end-of-line in the source text.  */
2514     return;
2515
2516   val = Fget (coding->symbol, Qeol_type);
2517   if (VECTORP (val) && XVECTOR (val)->size == 3)
2518     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2519 }
2520
2521 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2522    decoding, it may detect coding system and format of end-of-line if
2523    those are not yet decided.  */
2524
2525 int
2526 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2527      struct coding_system *coding;
2528      unsigned char *source, *destination;
2529      int src_bytes, dst_bytes;
2530      int *consumed;
2531 {
2532   int produced;
2533
2534   if (src_bytes <= 0)
2535     {
2536       *consumed = 0;
2537       return 0;
2538     }
2539
2540   if (coding->type == coding_type_automatic)
2541     detect_coding (coding, source, src_bytes);
2542
2543   if (coding->eol_type == CODING_EOL_AUTOMATIC)
2544     detect_eol (coding, source, src_bytes);
2545
2546   coding->carryover_size = 0;
2547   switch (coding->type)
2548     {
2549     case coding_type_no_conversion:
2550     label_no_conversion:
2551       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2552       bcopy (source, destination, produced);
2553       *consumed = produced;
2554       break;
2555
2556     case coding_type_internal:
2557     case coding_type_automatic:
2558       if (coding->eol_type == CODING_EOL_LF
2559           ||  coding->eol_type == CODING_EOL_AUTOMATIC)
2560         goto label_no_conversion;
2561       produced = decode_eol (coding, source, destination,
2562                              src_bytes, dst_bytes, consumed);
2563       break;
2564
2565     case coding_type_sjis:
2566       produced = decode_coding_sjis_big5 (coding, source, destination,
2567                                           src_bytes, dst_bytes, consumed,
2568                                           1);
2569       break;
2570
2571     case coding_type_iso2022:
2572       produced = decode_coding_iso2022 (coding, source, destination,
2573                                         src_bytes, dst_bytes, consumed);
2574       break;
2575
2576     case coding_type_big5:
2577       produced = decode_coding_sjis_big5 (coding, source, destination,
2578                                           src_bytes, dst_bytes, consumed,
2579                                           0);
2580       break;
2581
2582     case coding_type_ccl:
2583       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2584                              src_bytes, dst_bytes, consumed);
2585       break;
2586     }
2587
2588   return produced;
2589 }
2590
2591 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2592
2593 int
2594 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2595      struct coding_system *coding;
2596      unsigned char *source, *destination;
2597      int src_bytes, dst_bytes;
2598      int *consumed;
2599 {
2600   int produced;
2601
2602   coding->carryover_size = 0;
2603   switch (coding->type)
2604     {
2605     case coding_type_no_conversion:
2606     label_no_conversion:
2607       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2608       if (produced > 0)
2609         {
2610           bcopy (source, destination, produced);
2611           if (coding->selective)
2612             {
2613               unsigned char *p = destination, *pend = destination + produced;
2614               while (p < pend)
2615                 if (*p++ == '\015') p[-1] = '\n';
2616             }
2617         }
2618       *consumed = produced;
2619       break;
2620
2621     case coding_type_internal:
2622     case coding_type_automatic:
2623       if (coding->eol_type == CODING_EOL_LF
2624           ||  coding->eol_type == CODING_EOL_AUTOMATIC)
2625         goto label_no_conversion;
2626       produced = encode_eol (coding, source, destination,
2627                              src_bytes, dst_bytes, consumed);
2628       break;
2629
2630     case coding_type_sjis:
2631       produced = encode_coding_sjis_big5 (coding, source, destination,
2632                                           src_bytes, dst_bytes, consumed,
2633                                           1);
2634       break;
2635
2636     case coding_type_iso2022:
2637       produced = encode_coding_iso2022 (coding, source, destination,
2638                                         src_bytes, dst_bytes, consumed);
2639       break;
2640
2641     case coding_type_big5:
2642       produced = encode_coding_sjis_big5 (coding, source, destination,
2643                                           src_bytes, dst_bytes, consumed,
2644                                           0);
2645       break;
2646
2647     case coding_type_ccl:
2648       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2649                              src_bytes, dst_bytes, consumed);
2650       break;
2651     }
2652
2653   return produced;
2654 }
2655
2656 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2657
2658 /* Return maximum size (bytes) of a buffer enough for decoding
2659    SRC_BYTES of text encoded in CODING.  */
2660
2661 int
2662 decoding_buffer_size (coding, src_bytes)
2663      struct coding_system *coding;
2664      int src_bytes;
2665 {
2666   int magnification;
2667
2668   if (coding->type == coding_type_iso2022)
2669     magnification = 3;
2670   else if (coding->type == coding_type_ccl)
2671     magnification = coding->spec.ccl.decoder.buf_magnification;
2672   else
2673     magnification = 2;
2674
2675   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2676 }
2677
2678 /* Return maximum size (bytes) of a buffer enough for encoding
2679    SRC_BYTES of text to CODING.  */
2680
2681 int
2682 encoding_buffer_size (coding, src_bytes)
2683      struct coding_system *coding;
2684      int src_bytes;
2685 {
2686   int magnification;
2687
2688   if (coding->type == coding_type_ccl)
2689     magnification = coding->spec.ccl.encoder.buf_magnification;
2690   else
2691     magnification = 3;
2692
2693   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2694 }
2695
2696 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2697 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2698 #endif
2699
2700 char *conversion_buffer;
2701 int conversion_buffer_size;
2702
2703 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2704    or decoding.  Sufficient memory is allocated automatically.  If we
2705    run out of memory, return NULL.  */
2706
2707 char *
2708 get_conversion_buffer (size)
2709      int size;
2710 {
2711   if (size > conversion_buffer_size)
2712     {
2713       char *buf;
2714       int real_size = conversion_buffer_size * 2;
2715
2716       while (real_size < size) real_size *= 2;
2717       buf = (char *) xmalloc (real_size);
2718       xfree (conversion_buffer);
2719       conversion_buffer = buf;
2720       conversion_buffer_size = real_size;
2721     }
2722   return conversion_buffer;
2723 }
2724
2725 \f
2726 #ifdef emacs
2727 /*** 7. Emacs Lisp library functions ***/
2728
2729 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2730        1, 1, 0,
2731   "Return coding-vector of CODING-SYSTEM.\n\
2732 If CODING-SYSTEM is not a valid coding-system, return nil.")
2733   (obj)
2734      Lisp_Object obj;
2735 {
2736   while (SYMBOLP (obj) && !NILP (obj))
2737     obj = Fget (obj, Qcoding_system);
2738   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2739           ? Qnil : obj);
2740 }
2741
2742 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2743   "Return t if OBJECT is nil or a coding-system.\n\
2744 See document of make-coding-system for coding-system object.")
2745   (obj)
2746      Lisp_Object obj;
2747 {
2748   return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2749 }
2750
2751 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2752        Sread_non_nil_coding_system, 1, 1, 0,
2753   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2754   (prompt)
2755      Lisp_Object prompt;
2756 {
2757   Lisp_Object val;
2758   do
2759     {
2760       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2761                               Qt, Qnil, Qnil, Qnil);
2762     }
2763   while (XSTRING (val)->size == 0);
2764   return (Fintern (val, Qnil));
2765 }
2766
2767 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2768   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2769   (prompt)
2770      Lisp_Object prompt;
2771 {
2772   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2773                                       Qt, Qnil, Qnil, Qnil);
2774   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2775 }
2776
2777 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2778        1, 1, 0,
2779   "Check validity of CODING-SYSTEM.\n\
2780 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2781 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2782 The value of property should be a vector of length 5.")
2783   (coding_system)
2784      Lisp_Object coding_system;
2785 {
2786   CHECK_SYMBOL (coding_system, 0);
2787   if (!NILP (Fcoding_system_p (coding_system)))
2788     return coding_system;
2789   while (1)
2790     Fsignal (Qcoding_system_error, coding_system);
2791 }
2792
2793 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2794        2, 2, 0,
2795   "Detect coding-system of the text in the region between START and END.\n\
2796 Return a list of possible coding-systems ordered by priority.\n\
2797 If only ASCII characters are found, it returns `automatic-conversion'\n\
2798  or its subsidiary coding-system according to a detected end-of-line format.")
2799   (b, e)
2800      Lisp_Object b, e;
2801 {
2802   int coding_mask, eol_type;
2803   Lisp_Object val;
2804   int beg, end;
2805
2806   validate_region (&b, &e);
2807   beg = XINT (b), end = XINT (e);
2808   if (beg < GPT && end >= GPT) move_gap (end);
2809
2810   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2811   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2812
2813   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2814     {
2815       val = intern ("automatic-conversion");
2816       if (eol_type != CODING_EOL_AUTOMATIC)
2817         {
2818           Lisp_Object val2 = Fget (val, Qeol_type);
2819           if (VECTORP (val2))
2820             val = XVECTOR (val2)->contents[eol_type];
2821         }
2822     }
2823   else
2824     {
2825       Lisp_Object val2;
2826
2827       /* At first, gather possible coding-systems in VAL in a reverse
2828          order.  */
2829       val = Qnil;
2830       for (val2 = Vcoding_category_list;
2831            !NILP (val2);
2832            val2 = XCONS (val2)->cdr)
2833         {
2834           int idx
2835             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2836           if (coding_mask & (1 << idx))
2837             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2838         }
2839
2840       /* Then, change the order of the list, while getting subsidiary
2841          coding-systems.  */
2842       val2 = val;
2843       val = Qnil;
2844       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2845         {
2846           if (eol_type == CODING_EOL_AUTOMATIC)
2847             val = Fcons (XCONS (val2)->car, val);
2848           else
2849             {
2850               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2851               if (VECTORP (val3))
2852                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2853               else
2854                 val = Fcons (XCONS (val2)->car, val);
2855             }
2856         }
2857     }
2858
2859   return val;
2860 }
2861
2862 /* Scan text in the region between *BEGP and *ENDP, skip characters
2863    which we never have to encode to (iff ENCODEP is 1) or decode from
2864    coding system CODING at the head and tail, then set BEGP and ENDP
2865    to the addresses of start and end of the text we actually convert.  */
2866
2867 void
2868 shrink_conversion_area (begp, endp, coding, encodep)
2869      unsigned char **begp, **endp;
2870      struct coding_system *coding;
2871      int encodep;
2872 {
2873   register unsigned char *beg_addr = *begp, *end_addr = *endp;
2874
2875   if (coding->eol_type != CODING_EOL_LF
2876       && coding->eol_type != CODING_EOL_AUTOMATIC)
2877     /* Since we anyway have to convert end-of-line format, it is not
2878        worth skipping at most 100 bytes or so.  */
2879     return;
2880
2881   if (encodep)                  /* for encoding */
2882     {
2883       switch (coding->type)
2884         {
2885         case coding_type_no_conversion:
2886         case coding_type_internal:
2887         case coding_type_automatic:
2888           /* We need no conversion.  */
2889           *begp = *endp;
2890           return;
2891         case coding_type_ccl:
2892           /* We can't skip any data.  */
2893           return;
2894         case coding_type_iso2022:
2895           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2896             {
2897               unsigned char *bol = beg_addr;
2898               while (beg_addr < end_addr && *beg_addr < 0x80)
2899                 {
2900                   beg_addr++;
2901                   if (*(beg_addr - 1) == '\n')
2902                     bol = beg_addr;
2903                 }
2904               beg_addr = bol;
2905               goto label_skip_tail;
2906             }
2907           /* fall down ... */
2908         default:
2909           /* We can skip all ASCII characters at the head and tail.  */
2910           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2911         label_skip_tail:
2912           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2913           break;
2914         }
2915     }
2916   else                          /* for decoding */
2917     {
2918       switch (coding->type)
2919         {
2920         case coding_type_no_conversion:
2921           /* We need no conversion.  */
2922           *begp = *endp;
2923           return;
2924         case coding_type_internal:
2925           if (coding->eol_type == CODING_EOL_LF)
2926             {
2927               /* We need no conversion.  */
2928               *begp = *endp;
2929               return;
2930             }
2931           /* We can skip all but carriage-return.  */
2932           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2933           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2934           break;
2935         case coding_type_sjis:
2936         case coding_type_big5:
2937           /* We can skip all ASCII characters at the head.  */
2938           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2939           /* We can skip all ASCII characters at the tail except for
2940              the second byte of SJIS or BIG5 code.  */
2941           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2942           if (end_addr != *endp)
2943             end_addr++;
2944           break;
2945         case coding_type_ccl:
2946           /* We can't skip any data.  */
2947           return;
2948         default:                /* i.e. case coding_type_iso2022: */
2949           {
2950             unsigned char c;
2951
2952             /* We can skip all ASCII characters except for a few
2953                control codes at the head.  */
2954             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2955                    && c != ISO_CODE_CR && c != ISO_CODE_SO
2956                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2957               beg_addr++;
2958           }
2959           break;
2960         }
2961     }
2962   *begp = beg_addr;
2963   *endp = end_addr;
2964   return;
2965 }
2966
2967 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
2968    text between B and E.  B and E are buffer position.  */
2969
2970 Lisp_Object
2971 code_convert_region (b, e, coding, encodep)
2972      Lisp_Object b, e;
2973      struct coding_system *coding;
2974      int encodep;
2975 {
2976   int beg, end, len, consumed, produced;
2977   char *buf;
2978   unsigned char *begp, *endp;
2979   int pos = PT;
2980
2981   validate_region (&b, &e);
2982   beg = XINT (b), end = XINT (e);
2983   if (beg < GPT && end >= GPT)
2984     move_gap (end);
2985
2986   if (encodep && !NILP (coding->pre_write_conversion))
2987     {
2988       /* We must call a pre-conversion function which may put a new
2989          text to be converted in a new buffer.  */
2990       struct buffer *old = current_buffer, *new;
2991
2992       TEMP_SET_PT (beg);
2993       call2 (coding->pre_write_conversion, b, e);
2994       if (old != current_buffer)
2995         {
2996           /* Replace the original text by the text just generated.  */
2997           len = ZV - BEGV;
2998           new = current_buffer;
2999           set_buffer_internal (old);
3000           del_range (beg, end);
3001           insert_from_buffer (new, 1, len, 0);
3002           end = beg + len;
3003         }
3004     }
3005
3006   /* We may be able to shrink the conversion region.  */
3007   begp = POS_ADDR (beg); endp = begp + (end - beg);
3008   shrink_conversion_area (&begp, &endp, coding, encodep);
3009
3010   if (begp == endp)
3011     /* We need no conversion.  */
3012     len = end - beg;
3013   else
3014     {
3015       beg += begp - POS_ADDR (beg);
3016       end =  beg + (endp - begp);
3017
3018       if (encodep)
3019         len = encoding_buffer_size (coding, end - beg);
3020       else
3021         len = decoding_buffer_size (coding, end - beg);
3022       buf = get_conversion_buffer (len);
3023
3024       coding->last_block = 1;
3025       produced = (encodep
3026                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3027                                    &consumed)
3028                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3029                                    &consumed));
3030
3031       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3032
3033       TEMP_SET_PT (beg);
3034       insert (buf, produced);
3035       del_range (PT, PT + end - beg);
3036       if (pos >= end)
3037         pos = PT + (pos - end);
3038       else if (pos > beg)
3039         pos = beg;
3040       TEMP_SET_PT (pos);
3041   }
3042
3043   if (!encodep && !NILP (coding->post_read_conversion))
3044     {
3045       /* We must call a post-conversion function which may alter
3046          the text just converted.  */
3047       Lisp_Object insval;
3048
3049       beg = XINT (b);
3050       TEMP_SET_PT (beg);
3051       insval = call1 (coding->post_read_conversion, make_number (len));
3052       CHECK_NUMBER (insval, 0);
3053       len = XINT (insval);
3054     }
3055
3056   return make_number (len);
3057 }
3058
3059 Lisp_Object
3060 code_convert_string (str, coding, encodep, nocopy)
3061      Lisp_Object str, nocopy;
3062      struct coding_system *coding;
3063      int encodep;
3064 {
3065   int len, consumed, produced;
3066   char *buf;
3067   unsigned char *begp, *endp;
3068   int head_skip, tail_skip;
3069   struct gcpro gcpro1;
3070
3071   if (encodep && !NILP (coding->pre_write_conversion)
3072       || !encodep && !NILP (coding->post_read_conversion))
3073     {
3074       /* Since we have to call Lisp functions which assume target text
3075          is in a buffer, after setting a temporary buffer, call
3076          code_convert_region.  */
3077       int count = specpdl_ptr - specpdl;
3078       int len = XSTRING (str)->size;
3079       Lisp_Object result;
3080       struct buffer *old = current_buffer;
3081
3082       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3083       temp_output_buffer_setup (" *code-converting-work*");
3084       set_buffer_internal (XBUFFER (Vstandard_output));
3085       insert_from_string (str, 0, len, 0);
3086       code_convert_region (make_number (BEGV), make_number (ZV),
3087                            coding, encodep);
3088       result = make_buffer_string (BEGV, ZV, 0);
3089       set_buffer_internal (old);
3090       return unbind_to (count, result);
3091     }
3092
3093   /* We may be able to shrink the conversion region.  */
3094   begp = XSTRING (str)->data;
3095   endp = begp + XSTRING (str)->size;
3096   shrink_conversion_area (&begp, &endp, coding, encodep);
3097
3098   if (begp == endp)
3099     /* We need no conversion.  */
3100     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3101
3102   head_skip = begp - XSTRING (str)->data;
3103   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3104
3105   GCPRO1 (str);
3106
3107   if (encodep)
3108     len = encoding_buffer_size (coding, endp - begp);
3109   else
3110     len = decoding_buffer_size (coding, endp - begp);
3111   buf = get_conversion_buffer (len + head_skip + tail_skip);
3112
3113   bcopy (XSTRING (str)->data, buf, head_skip);
3114   coding->last_block = 1;
3115   produced = (encodep
3116               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3117                                buf + head_skip, endp - begp, len, &consumed)
3118               : decode_coding (coding, XSTRING (str)->data + head_skip,
3119                                buf + head_skip, endp - begp, len, &consumed));
3120   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3121          buf + head_skip + produced,
3122          tail_skip);
3123
3124   UNGCPRO;
3125
3126   return make_string (buf, head_skip + produced + tail_skip);
3127 }
3128
3129 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3130        3, 3, "r\nzCoding system: ",
3131   "Decode current region by specified coding system.\n\
3132 When called from a program, takes three arguments:\n\
3133 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3134 Return length of decoded text.")
3135   (b, e, coding_system)
3136      Lisp_Object b, e, coding_system;
3137 {
3138   struct coding_system coding;
3139
3140   CHECK_NUMBER_COERCE_MARKER (b, 0);
3141   CHECK_NUMBER_COERCE_MARKER (e, 1);
3142   CHECK_SYMBOL (coding_system, 2);
3143
3144   if (NILP (coding_system))
3145     return make_number (XFASTINT (e) - XFASTINT (b));
3146   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3147     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3148
3149   return code_convert_region (b, e, &coding, 0);
3150 }
3151
3152 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3153        3, 3, "r\nzCoding system: ",
3154   "Encode current region by specified coding system.\n\
3155 When called from a program, takes three arguments:\n\
3156 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3157 Return length of encoded text.")
3158   (b, e, coding_system)
3159      Lisp_Object b, e, coding_system;
3160 {
3161   struct coding_system coding;
3162
3163   CHECK_NUMBER_COERCE_MARKER (b, 0);
3164   CHECK_NUMBER_COERCE_MARKER (e, 1);
3165   CHECK_SYMBOL (coding_system, 2);
3166
3167   if (NILP (coding_system))
3168     return make_number (XFASTINT (e) - XFASTINT (b));
3169   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3170     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3171
3172   return code_convert_region (b, e, &coding, 1);
3173 }
3174
3175 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3176        2, 3, 0,
3177   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3178 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3179 of decoding.")
3180   (string, coding_system, nocopy)
3181      Lisp_Object string, coding_system, nocopy;
3182 {
3183   struct coding_system coding;
3184
3185   CHECK_STRING (string, 0);
3186   CHECK_SYMBOL (coding_system, 1);
3187
3188   if (NILP (coding_system))
3189     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3190   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3191     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3192
3193   return code_convert_string (string, &coding, 0, nocopy);
3194 }
3195
3196 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3197        2, 3, 0,
3198   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3199 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3200 of encoding.")
3201   (string, coding_system, nocopy)
3202      Lisp_Object string, coding_system, nocopy;
3203 {
3204   struct coding_system coding;
3205
3206   CHECK_STRING (string, 0);
3207   CHECK_SYMBOL (coding_system, 1);
3208
3209   if (NILP (coding_system))
3210     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3211   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3212     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3213
3214   return code_convert_string (string, &coding, 1, nocopy);
3215 }
3216
3217 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3218   "Decode a JISX0208 character of shift-jis encoding.\n\
3219 CODE is the character code in SJIS.\n\
3220 Return the corresponding character.")
3221   (code)
3222      Lisp_Object code;
3223 {
3224   unsigned char c1, c2, s1, s2;
3225   Lisp_Object val;
3226
3227   CHECK_NUMBER (code, 0);
3228   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3229   DECODE_SJIS (s1, s2, c1, c2);
3230   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3231   return val;
3232 }
3233
3234 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3235   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3236 Return the corresponding character code in SJIS.")
3237   (ch)
3238      Lisp_Object ch;
3239 {
3240   int charset, c1, c2, s1, s2;
3241   Lisp_Object val;
3242
3243   CHECK_NUMBER (ch, 0);
3244   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3245   if (charset == charset_jisx0208)
3246     {
3247       ENCODE_SJIS (c1, c2, s1, s2);
3248       XSETFASTINT (val, (s1 << 8) | s2);
3249     }
3250   else
3251     XSETFASTINT (val, 0);
3252   return val;
3253 }
3254
3255 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3256   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3257 CODE is the character code in BIG5.\n\
3258 Return the corresponding character.")
3259   (code)
3260      Lisp_Object code;
3261 {
3262   int charset;
3263   unsigned char b1, b2, c1, c2;
3264   Lisp_Object val;
3265
3266   CHECK_NUMBER (code, 0);
3267   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3268   DECODE_BIG5 (b1, b2, charset, c1, c2);
3269   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3270   return val;
3271 }
3272
3273 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3274   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3275 Return the corresponding character code in Big5.")
3276   (ch)
3277      Lisp_Object ch;
3278 {
3279   int charset, c1, c2, b1, b2;
3280   Lisp_Object val;
3281
3282   CHECK_NUMBER (ch, 0);
3283   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3284   if (charset == charset_big5_1 || charset == charset_big5_2)
3285     {
3286       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3287       XSETFASTINT (val, (b1 << 8) | b2);
3288     }
3289   else
3290     XSETFASTINT (val, 0);
3291   return val;
3292 }
3293
3294 DEFUN ("set-terminal-coding-system",
3295        Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3296        "zCoding-system for terminal display: ",
3297   "Set coding-system of your terminal to CODING-SYSTEM.\n\
3298 All outputs to terminal are encoded to this coding-system.")
3299   (coding_system)
3300      Lisp_Object coding_system;
3301 {
3302   CHECK_SYMBOL (coding_system, 0);
3303   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3304   update_mode_lines++;
3305   if (!NILP (Finteractive_p ()))
3306     Fredraw_display ();
3307   return Qnil;
3308 }
3309
3310 DEFUN ("terminal-coding-system",
3311        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3312   "Return coding-system of your terminal.")
3313   ()
3314 {
3315   return terminal_coding.symbol;
3316 }
3317
3318 DEFUN ("set-keyboard-coding-system",
3319        Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
3320        "zCoding-system for keyboard input: ",
3321   "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3322 All inputs from terminal are decoded from this coding-system.")
3323   (coding_system)
3324      Lisp_Object coding_system;
3325 {
3326   CHECK_SYMBOL (coding_system, 0);
3327   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3328   return Qnil;
3329 }
3330
3331 DEFUN ("keyboard-coding-system",
3332        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3333   "Return coding-system of what is sent from terminal keyboard.")
3334   ()
3335 {
3336   return keyboard_coding.symbol;
3337 }
3338
3339 \f
3340 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3341        1, MANY, 0,
3342   "Choose a coding system for a file operation based on file name.\n\
3343 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3344 ENCODING-SYSTEM is the coding system to use for encoding\n\
3345 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3346 for decoding (in case OPERATION does decoding).\n\
3347 \n\
3348 The first argument OPERATION specifies an I/O primitive:\n\
3349   For file I/O, `insert-file-contents' or `write-region'.\n\
3350   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3351   For network I/O, `open-network-stream'.\n\
3352 \n\
3353 The remaining arguments should be the same arguments that were passed\n\
3354 to the primitive.  Depending on which primitive, one of those arguments\n\
3355 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3356 whichever argument specifies the file name is TARGET.\n\
3357 \n\
3358 TARGET has a meaning which depends on OPERATION:\n\
3359   For file I/O, TARGET is a file name.\n\
3360   For process I/O, TARGET is a process name.\n\
3361   For network I/O, TARGET is a service name or a port number\n\
3362 \n\
3363 This function looks up what `coding-system-alist' specifies for\n\
3364 OPERATION and TARGET.  It may specify a cons cell which represents\n\
3365 a particular coding system or it may have a function to call.\n\
3366 In the latter case, we call the function with one argument,\n\
3367 which is a list of all the arguments given to `find-coding-system'.")
3368   (nargs, args)
3369      int nargs;
3370      Lisp_Object *args;
3371 {
3372   Lisp_Object operation, target_idx, target, val;
3373   register Lisp_Object chain;
3374
3375   if (nargs < 2)
3376     error ("Too few arguments");
3377   operation = args[0];
3378   if (!SYMBOLP (operation)
3379       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3380     error ("Invalid first arguement");
3381   if (nargs < 1 + XINT (target_idx))
3382     error ("Too few arguments for operation: %s",
3383            XSYMBOL (operation)->name->data);
3384   target = args[XINT (target_idx) + 1];
3385   if (!(STRINGP (target)
3386         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3387     error ("Invalid %dth argument", XINT (target_idx) + 1);
3388
3389   chain = Fassq (operation, Vcoding_system_alist);
3390   if (NILP (chain))
3391     return Qnil;
3392
3393   for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3394     {
3395       Lisp_Object elt = XCONS (chain)->car;
3396
3397       if (CONSP (elt)
3398           && ((STRINGP (target)
3399                && STRINGP (XCONS (elt)->car)
3400                && fast_string_match (XCONS (elt)->car, target) >= 0)
3401               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3402         return (val = XCONS (elt)->cdr, CONSP (val)
3403                 ? val
3404                 : ((SYMBOLP (val) && !NILP (Fboundp (val))
3405                     ? call2 (val, Flist (nargs, args))
3406                     : Qnil)));
3407     }
3408   return Qnil;
3409 }
3410
3411 #endif /* emacs */
3412
3413 \f
3414 /*** 8. Post-amble ***/
3415
3416 init_coding_once ()
3417 {
3418   int i;
3419
3420   /* Emacs internal format specific initialize routine.  */
3421   for (i = 0; i <= 0x20; i++)
3422     emacs_code_class[i] = EMACS_control_code;
3423   emacs_code_class[0x0A] = EMACS_linefeed_code;
3424   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3425   for (i = 0x21 ; i < 0x7F; i++)
3426     emacs_code_class[i] = EMACS_ascii_code;
3427   emacs_code_class[0x7F] = EMACS_control_code;
3428   emacs_code_class[0x80] = EMACS_leading_code_composition;
3429   for (i = 0x81; i < 0xFF; i++)
3430     emacs_code_class[i] = EMACS_invalid_code;
3431   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3432   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3433   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3434   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3435
3436   /* ISO2022 specific initialize routine.  */
3437   for (i = 0; i < 0x20; i++)
3438     iso_code_class[i] = ISO_control_code;
3439   for (i = 0x21; i < 0x7F; i++)
3440     iso_code_class[i] = ISO_graphic_plane_0;
3441   for (i = 0x80; i < 0xA0; i++)
3442     iso_code_class[i] = ISO_control_code;
3443   for (i = 0xA1; i < 0xFF; i++)
3444     iso_code_class[i] = ISO_graphic_plane_1;
3445   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3446   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3447   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3448   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3449   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3450   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3451   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3452   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3453   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3454   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3455
3456   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3457   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3458
3459   setup_coding_system (Qnil, &keyboard_coding);
3460   setup_coding_system (Qnil, &terminal_coding);
3461 }
3462
3463 #ifdef emacs
3464
3465 syms_of_coding ()
3466 {
3467   Qtarget_idx = intern ("target-idx");
3468   staticpro (&Qtarget_idx);
3469
3470   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3471   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3472
3473   Qcall_process = intern ("call-process");
3474   staticpro (&Qcall_process);
3475   Fput (Qcall_process, Qtarget_idx, make_number (0));
3476
3477   Qcall_process_region = intern ("call-process-region");
3478   staticpro (&Qcall_process_region);
3479   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3480
3481   Qstart_process = intern ("start-process");
3482   staticpro (&Qstart_process);
3483   Fput (Qstart_process, Qtarget_idx, make_number (2));
3484
3485   Qopen_network_stream = intern ("open-network-stream");
3486   staticpro (&Qopen_network_stream);
3487   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3488
3489   Qcoding_system = intern ("coding-system");
3490   staticpro (&Qcoding_system);
3491
3492   Qeol_type = intern ("eol-type");
3493   staticpro (&Qeol_type);
3494
3495   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3496   staticpro (&Qbuffer_file_coding_system);
3497
3498   Qpost_read_conversion = intern ("post-read-conversion");
3499   staticpro (&Qpost_read_conversion);
3500
3501   Qpre_write_conversion = intern ("pre-write-conversion");
3502   staticpro (&Qpre_write_conversion);
3503
3504   Qcoding_system_vector = intern ("coding-system-vector");
3505   staticpro (&Qcoding_system_vector);
3506
3507   Qcoding_system_p = intern ("coding-system-p");
3508   staticpro (&Qcoding_system_p);
3509
3510   Qcoding_system_error = intern ("coding-system-error");
3511   staticpro (&Qcoding_system_error);
3512
3513   Fput (Qcoding_system_error, Qerror_conditions,
3514         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3515   Fput (Qcoding_system_error, Qerror_message,
3516         build_string ("Coding-system error"));
3517
3518   Qcoding_category_index = intern ("coding-category-index");
3519   staticpro (&Qcoding_category_index);
3520
3521   {
3522     int i;
3523     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3524       {
3525         coding_category_table[i] = intern (coding_category_name[i]);
3526         staticpro (&coding_category_table[i]);
3527         Fput (coding_category_table[i], Qcoding_category_index,
3528               make_number (i));
3529       }
3530   }
3531
3532   defsubr (&Scoding_system_vector);
3533   defsubr (&Scoding_system_p);
3534   defsubr (&Sread_coding_system);
3535   defsubr (&Sread_non_nil_coding_system);
3536   defsubr (&Scheck_coding_system);
3537   defsubr (&Sdetect_coding_region);
3538   defsubr (&Sdecode_coding_region);
3539   defsubr (&Sencode_coding_region);
3540   defsubr (&Sdecode_coding_string);
3541   defsubr (&Sencode_coding_string);
3542   defsubr (&Sdecode_sjis_char);
3543   defsubr (&Sencode_sjis_char);
3544   defsubr (&Sdecode_big5_char);
3545   defsubr (&Sencode_big5_char);
3546   defsubr (&Sset_terminal_coding_system);
3547   defsubr (&Sterminal_coding_system);
3548   defsubr (&Sset_keyboard_coding_system);
3549   defsubr (&Skeyboard_coding_system);
3550   defsubr (&Sfind_coding_system);
3551
3552   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3553     "List of coding-categories (symbols) ordered by priority.");
3554   {
3555     int i;
3556
3557     Vcoding_category_list = Qnil;
3558     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3559       Vcoding_category_list
3560         = Fcons (coding_category_table[i], Vcoding_category_list);
3561   }
3562
3563   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3564     "A variable of internal use only.\n\
3565 If the value is a coding system, it is used for decoding on read operation.\n\
3566 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3567   Vcoding_system_for_read = Qnil;
3568
3569   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3570     "A variable of internal use only.\n\
3571 If the value is a coding system, it is used for encoding on write operation.\n\
3572 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3573   Vcoding_system_for_write = Qnil;
3574
3575   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3576     "Coding-system used in the latest file or process I/O.");
3577   Vlast_coding_system_used = Qnil;
3578
3579   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3580     "Nested alist to decide a coding system for a specific I/O operation.\n\
3581 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3582 \n\
3583 OPERATION is one of the following Emacs I/O primitives:\n\
3584   For file I/O, insert-file-contents and write-region.\n\
3585   For process I/O, call-process, call-process-region, and start-process.\n\
3586   For network I/O, open-network-stream.\n\
3587 In addition, for process I/O, `process-argument' can be specified for\n\
3588 encoding arguments of the process.\n\
3589 \n\
3590 REGEXP is a regular expression matching a target of OPERATION, where\n\
3591 target is a file name for file I/O operations, a process name for\n\
3592 process I/O operations, or a service name for network I/O\n\
3593 operations.  REGEXP might be a port number for network I/O operation.\n\
3594 \n\
3595 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3596 character code on OPERATION, or a function symbol returning the cons.\n\
3597 See the documentation of `find-coding-system' for more detail.");
3598   Vcoding_system_alist = Qnil;
3599
3600   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3601     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3602   eol_mnemonic_unix = '.';
3603
3604   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3605     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3606   eol_mnemonic_dos = ':';
3607
3608   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3609     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3610   eol_mnemonic_mac = '\'';
3611
3612   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3613     "Mnemonic character indicating end-of-line format is not yet decided.");
3614   eol_mnemonic_undecided = '-';
3615
3616   DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
3617     "Alist of charsets vs the alternate charsets.\n\
3618 While decoding, if a charset (car part of an element) is found,\n\
3619 decode it as the alternate charset (cdr part of the element).");
3620   Valternate_charset_table = Qnil;
3621
3622   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3623     "Alist of charsets vs revision numbers.\n\
3624 While encoding, if a charset (car part of an element) is found,\n\
3625 designate it with the escape sequence identifing revision (cdr part of the element).");
3626   Vcharset_revision_alist = Qnil;
3627 }
3628
3629 #endif /* emacs */