src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Ver.1.0.
   3
   4    Copyright (C) 1995 Free Software Foundation, Inc.
   5    Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program; if not, write to the Free Software
  19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  20
  21 /*** TABLE OF CONTENTS ***
  22
  23   1. Preamble
  24   2. Emacs' internal format handlers
  25   3. ISO2022 handlers
  26   4. Shift-JIS and BIG5 handlers
  27   5. End-of-line handlers
  28   6. C library functions
  29   7. Emacs Lisp library functions
  30   8. Post-amble
  31
  32 */
  33
  34 /*** GENERAL NOTE on CODING SYSTEM ***
  35
  36   Coding system is an encoding mechanism of one or more character
  37   sets.  Here's a list of coding systems which Emacs can handle.  When
  38   we say "decode", it means converting some other coding system to
  39   Emacs' internal format, and when we say "encode", it means
  40   converting Emacs' internal format to some other coding system.
  41
  42   0. Emacs' internal format
  43
  44   Emacs itself holds a multi-lingual character in a buffer and a string
  45   in a special format.  Details are described in the section 2.
  46
  47   1. ISO2022
  48
  49   The most famous coding system for multiple character sets.  X's
  50   Compound Text, various EUCs (Extended Unix Code), and such coding
  51   systems used in Internet communication as ISO-2022-JP are all
  52   variants of ISO2022.  Details are described in the section 3.
  53
  54   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  55
  56   A coding system to encode character sets: ASCII, JISX0201, and
  57   JISX0208.  Widely used for PC's in Japan.  Details are described in
  58   the section 4.
  59
  60   3. BIG5
  61
  62   A coding system to encode character sets: ASCII and Big5.  Widely
  63   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  64   described in the section 4.  In this file, when written as "BIG5"
  65   (all uppercase), it means the coding system, and when written as
  66   "Big5" (capitalized), it means the character set.
  67
  68   4. Else
  69
  70   If a user want to read/write a text encoded in a coding system not
  71   listed above, he can supply a decoder and an encoder for it in CCL
  72   (Code Conversion Language) programs.  Emacs executes the CCL program
  73   while reading/writing.
  74
  75   Emacs represent a coding-system by a Lisp symbol that has a property
  76   `coding-system'.  But, before actually using the coding-system, the
  77   information about it is set in a structure of type `struct
  78   coding_system' for rapid processing.  See the section 6 for more
  79   detail.
  80
  81 */
  82
  83 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  84
  85   How end-of-line of a text is encoded depends on a system.  For
  86   instance, Unix's format is just one byte of `line-feed' code,
  87   whereas DOS's format is two bytes sequence of `carriage-return' and
  88   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  89
  90   Since how characters in a text is encoded and how end-of-line is
  91   encoded is independent, any coding system described above can take
  92   any format of end-of-line.  So, Emacs has information of format of
  93   end-of-line in each coding-system.  See the section 6 for more
  94   detail.
  95
  96 */
  97
  98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  99
 100   These functions check if a text between SRC and SRC_END is encoded
 101   in the coding system category XXX.  Each returns an integer value in
 102   which appropriate flag bits for the category XXX is set.  The flag
 103   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 104   template of these functions.  */
 105 #if 0
 106 int
 107 detect_coding_internal (src, src_end)
 108      unsigned char *src, *src_end;
 109 {
 110   ...
 111 }
 112 #endif
 113
 114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 115
 116   These functions decode SRC_BYTES length text at SOURCE encoded in
 117   CODING to Emacs' internal format.  The resulting text goes to a
 118   place pointed by DESTINATION, the length of which should not exceed
 119   DST_BYTES.  The bytes actually processed is returned as *CONSUMED.
 120   The return value is the length of the decoded text.  Below is a
 121   template of these functions.  */
 122 #if 0
 123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 124      struct coding_system *coding;
 125      unsigned char *source, *destination;
 126      int src_bytes, dst_bytes;
 127      int *consumed;
 128 {
 129   ...
 130 }
 131 #endif
 132
 133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 134
 135   These functions encode SRC_BYTES length text at SOURCE of Emacs
 136   internal format to CODING.  The resulting text goes to a place
 137   pointed by DESTINATION, the length of which should not exceed
 138   DST_BYTES.  The bytes actually processed is returned as *CONSUMED.
 139   The return value is the length of the encoded text.  Below is a
 140   template of these functions.  */
 141 #if 0
 142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 143      struct coding_system *coding;
 144      unsigned char *source, *destination;
 145      int src_bytes, dst_bytes;
 146      int *consumed;
 147 {
 148   ...
 149 }
 150 #endif
 151
 152 /*** COMMONLY USED MACROS ***/
 153
 154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 155    THREE_MORE_BYTES safely get one, two, and three bytes from the
 156    source text respectively.  If there are not enough bytes in the
 157    source, they jump to `label_end_of_loop'.  The caller should set
 158    variables `src' and `src_end' to appropriate areas in advance.  */
 159
 160 #define ONE_MORE_BYTE(c1)       \
 161   do {                          \
 162     if (src < src_end)          \
 163       c1 = *src++;              \
 164     else                        \
 165       goto label_end_of_loop;   \
 166   } while (0)
 167
 168 #define TWO_MORE_BYTES(c1, c2)  \
 169   do {                          \
 170     if (src + 1 < src_end)      \
 171       c1 = *src++, c2 = *src++; \
 172     else                        \
 173       goto label_end_of_loop;   \
 174   } while (0)
 175
 176 #define THREE_MORE_BYTES(c1, c2, c3)            \
 177   do {                                          \
 178     if (src + 2 < src_end)                      \
 179       c1 = *src++, c2 = *src++, c3 = *src++;    \
 180     else                                        \
 181       goto label_end_of_loop;                   \
 182   } while (0)
 183
 184 /* The following three macros DECODE_CHARACTER_ASCII,
 185    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 186    the multi-byte form of a character of each class at the place
 187    pointed by `dst'.  The caller should set the variable `dst' to
 188    point to an appropriate area and the variable `coding' to point to
 189    the coding-system of the currently decoding text in advance.  */
 190
 191 /* Decode one ASCII character C.  */
 192
 193 #define DECODE_CHARACTER_ASCII(c)                               \
 194   do {                                                          \
 195     if (COMPOSING_P (coding->composing))                        \
 196       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 197     else                                                        \
 198       *dst++ = (c);                                             \
 199   } while (0)
 200
 201 /* Decode one DIMENSION1 character of which charset is CHARSET and
 202    position-code is C.  */
 203
 204 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 205   do {                                                                  \
 206     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 207     if (COMPOSING_P (coding->composing))                                \
 208       *dst++ = leading_code + 0x20;                                     \
 209     else                                                                \
 210       *dst++ = leading_code;                                            \
 211     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 212       *dst++ = leading_code;                                            \
 213     *dst++ = (c) | 0x80;                                                \
 214   } while (0)
 215
 216 /* Decode one DIMENSION2 character of which charset is CHARSET and
 217    position-codes are C1 and C2.  */
 218
 219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 220   do {                                                  \
 221     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 222     *dst++ = (c2) | 0x80;                               \
 223   } while (0)
 224
 225 \f
 226 /*** 1. Preamble ***/
 227
 228 #include <stdio.h>
 229
 230 #ifdef emacs
 231
 232 #include <config.h>
 233 #include "lisp.h"
 234 #include "buffer.h"
 235 #include "charset.h"
 236 #include "ccl.h"
 237 #include "coding.h"
 238 #include "window.h"
 239
 240 #else  /* not emacs */
 241
 242 #include "mulelib.h"
 243
 244 #endif /* not emacs */
 245
 246 Lisp_Object Qcoding_system, Qeol_type;
 247 Lisp_Object Qbuffer_file_coding_system;
 248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 249
 250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 252 Lisp_Object Qstart_process, Qopen_network_stream;
 253 Lisp_Object Qtarget_idx;
 254
 255 /* Mnemonic character of each format of end-of-line.  */
 256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 257 /* Mnemonic character to indicate format of end-of-line is not yet
 258    decided.  */
 259 int eol_mnemonic_undecided;
 260
 261 #ifdef emacs
 262
 263 Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
 264
 265 /* Coding-systems are handed between Emacs Lisp programs and C internal
 266    routines by the following three variables.  */
 267 /* Coding-system for reading files and receiving data from process.  */
 268 Lisp_Object Vcoding_system_for_read;
 269 /* Coding-system for writing files and sending data to process.  */
 270 Lisp_Object Vcoding_system_for_write;
 271 /* Coding-system actually used in the latest I/O.  */
 272 Lisp_Object Vlast_coding_system_used;
 273
 274 /* Coding-system of what terminal accept for displaying.  */
 275 struct coding_system terminal_coding;
 276
 277 /* Coding-system of what is sent from terminal keyboard.  */
 278 struct coding_system keyboard_coding;
 279
 280 Lisp_Object Vcoding_system_alist;
 281
 282 #endif /* emacs */
 283
 284 Lisp_Object Qcoding_category_index;
 285
 286 /* List of symbols `coding-category-xxx' ordered by priority.  */
 287 Lisp_Object Vcoding_category_list;
 288
 289 /* Table of coding-systems currently assigned to each coding-category.  */
 290 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 291
 292 /* Table of names of symbol for each coding-category.  */
 293 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 294   "coding-category-internal",
 295   "coding-category-sjis",
 296   "coding-category-iso-7",
 297   "coding-category-iso-8-1",
 298   "coding-category-iso-8-2",
 299   "coding-category-iso-else",
 300   "coding-category-big5",
 301   "coding-category-binary"
 302 };
 303
 304 /* Alist of charsets vs the alternate charsets.  */
 305 Lisp_Object Valternate_charset_table;
 306
 307 /* Alist of charsets vs revision number.  */
 308 Lisp_Object Vcharset_revision_alist;
 309
 310 \f
 311 /*** 2. Emacs internal format handlers ***/
 312
 313 /* Emacs' internal format for encoding multiple character sets is a
 314    kind of multi-byte encoding, i.e. encoding a character by a sequence
 315    of one-byte codes of variable length.  ASCII characters and control
 316    characters (e.g. `tab', `newline') are represented by one-byte as
 317    is.  It takes the range 0x00 through 0x7F.  The other characters
 318    are represented by a sequence of `base leading-code', optional
 319    `extended leading-code', and one or two `position-code's.  Length
 320    of the sequence is decided by the base leading-code.  Leading-code
 321    takes the range 0x80 through 0x9F, whereas extended leading-code
 322    and position-code take the range 0xA0 through 0xFF.  See the
 323    document of `charset.h' for more detail about leading-code and
 324    position-code.
 325
 326    There's one exception in this rule.  Special leading-code
 327    `leading-code-composition' denotes that the following several
 328    characters should be composed into one character.  Leading-codes of
 329    components (except for ASCII) are added 0x20.  An ASCII character
 330    component is represented by a 2-byte sequence of `0xA0' and
 331    `ASCII-code + 0x80'.  See also the document in `charset.h' for the
 332    detail of composite character.  Hence, we can summarize the code
 333    range as follows:
 334
 335    --- CODE RANGE of Emacs' internal format ---
 336    (character set)      (range)
 337    ASCII                0x00 .. 0x7F
 338    ELSE (1st byte)      0x80 .. 0x9F
 339         (rest bytes)    0xA0 .. 0xFF
 340    ---------------------------------------------
 341
 342   */
 343
 344 enum emacs_code_class_type emacs_code_class[256];
 345
 346 /* Go to the next statement only if *SRC is accessible and the code is
 347    greater than 0xA0.  */
 348 #define CHECK_CODE_RANGE_A0_FF  \
 349   do {                          \
 350     if (src >= src_end)         \
 351       goto label_end_of_switch; \
 352     else if (*src++ < 0xA0)     \
 353       return 0;                 \
 354   } while (0)
 355
 356 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 357    Check if a text is encoded in Emacs' internal format.  If it is,
 358    return CODING_CATEGORY_MASK_INTERNAL, else return 0.  */
 359
 360 int
 361 detect_coding_internal (src, src_end)
 362      unsigned char *src, *src_end;
 363 {
 364   unsigned char c;
 365   int composing = 0;
 366
 367   while (src < src_end)
 368     {
 369       c = *src++;
 370
 371       if (composing)
 372         {
 373           if (c < 0xA0)
 374             composing = 0;
 375           else
 376             c -= 0x20;
 377         }
 378
 379       switch (emacs_code_class[c])
 380         {
 381         case EMACS_ascii_code:
 382         case EMACS_linefeed_code:
 383           break;
 384
 385         case EMACS_control_code:
 386           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 387             return 0;
 388           break;
 389
 390         case EMACS_invalid_code:
 391           return 0;
 392
 393         case EMACS_leading_code_composition: /* c == 0x80 */
 394           if (composing)
 395             CHECK_CODE_RANGE_A0_FF;
 396           else
 397             composing = 1;
 398           break;
 399
 400         case EMACS_leading_code_4:
 401           CHECK_CODE_RANGE_A0_FF;
 402           /* fall down to check it two more times ...  */
 403
 404         case EMACS_leading_code_3:
 405           CHECK_CODE_RANGE_A0_FF;
 406           /* fall down to check it one more time ...  */
 407
 408         case EMACS_leading_code_2:
 409           CHECK_CODE_RANGE_A0_FF;
 410           break;
 411
 412         default:
 413         label_end_of_switch:
 414           break;
 415         }
 416     }
 417   return CODING_CATEGORY_MASK_INTERNAL;
 418 }
 419
 420 \f
 421 /*** 3. ISO2022 handlers ***/
 422
 423 /* The following note describes the coding system ISO2022 briefly.
 424    Since the intension of this note is to help understanding of the
 425    programs in this file, some parts are NOT ACCURATE or OVERLY
 426    SIMPLIFIED.  For the thorough understanding, please refer to the
 427    original document of ISO2022.
 428
 429    ISO2022 provides many mechanisms to encode several character sets
 430    in 7-bit and 8-bit environment.  If one choose 7-bite environment,
 431    all text is encoded by codes of less than 128.  This may make the
 432    encoded text a little bit longer, but the text get more stability
 433    to pass through several gateways (some of them split MSB off).
 434
 435    There are two kind of character set: control character set and
 436    graphic character set.  The former contains control characters such
 437    as `newline' and `escape' to provide control functions (control
 438    functions are provided also by escape sequence).  The latter
 439    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 440    two control character sets and many graphic character sets.
 441
 442    Graphic character sets are classified into one of the following
 443    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 444    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 445    bytes (DIMENSION) and the number of characters in one dimension
 446    (CHARS) of the set.  In addition, each character set is assigned an
 447    identification tag (called "final character" and denoted as <F>
 448    here after) which is unique in each class.  <F> of each character
 449    set is decided by ECMA(*) when it is registered in ISO.  Code range
 450    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 451
 452    Note (*): ECMA = European Computer Manufacturers Association
 453
 454    Here are examples of graphic character set [NAME(<F>)]:
 455         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 456         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 457         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 458         o DIMENSION2_CHARS96 -- none for the moment
 459
 460    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 461         C0 [0x00..0x1F] -- control character plane 0
 462         GL [0x20..0x7F] -- graphic character plane 0
 463         C1 [0x80..0x9F] -- control character plane 1
 464         GR [0xA0..0xFF] -- graphic character plane 1
 465
 466    A control character set is directly designated and invoked to C0 or
 467    C1 by an escape sequence.  The most common case is that ISO646's
 468    control character set is designated/invoked to C0 and ISO6429's
 469    control character set is designated/invoked to C1, and usually
 470    these designations/invocations are omitted in a coded text.  With
 471    7-bit environment, only C0 can be used, and a control character for
 472    C1 is encoded by an appropriate escape sequence to fit in the
 473    environment.  All control characters for C1 are defined the
 474    corresponding escape sequences.
 475
 476    A graphic character set is at first designated to one of four
 477    graphic registers (G0 through G3), then these graphic registers are
 478    invoked to GL or GR.  These designations and invocations can be
 479    done independently.  The most common case is that G0 is invoked to
 480    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 481    these invocations and designations are omitted in a coded text.
 482    With 7-bit environment, only GL can be used.
 483
 484    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 485    and 0x7F of GL area work as control characters SPACE and DEL
 486    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 487
 488    There are two ways of invocation: locking-shift and single-shift.
 489    With locking-shift, the invocation lasts until the next different
 490    invocation, whereas with single-shift, the invocation works only
 491    for the following character and doesn't affect locking-shift.
 492    Invocations are done by the following control characters or escape
 493    sequences.
 494
 495    ----------------------------------------------------------------------
 496    function             control char    escape sequence description
 497    ----------------------------------------------------------------------
 498    SI  (shift-in)               0x0F    none            invoke G0 to GL
 499    SI  (shift-out)              0x0E    none            invoke G1 to GL
 500    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 501    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 502    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 503    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 504    ----------------------------------------------------------------------
 505    The first four are for locking-shift.  Control characters for these
 506    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 507
 508    Designations are done by the following escape sequences.
 509    ----------------------------------------------------------------------
 510    escape sequence      description
 511    ----------------------------------------------------------------------
 512    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 513    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 514    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 515    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 516    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 517    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 518    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 519    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 520    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 521    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 522    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 523    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 524    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 525    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 526    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 527    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 528    ----------------------------------------------------------------------
 529
 530    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 531    of dimension 1, chars 94, and final character <F>, and etc.
 532
 533    Note (*): Although these designations are not allowed in ISO2022,
 534    Emacs accepts them on decoding, and produces them on encoding
 535    CHARS96 character set in a coding system which is characterized as
 536    7-bit environment, non-locking-shift, and non-single-shift.
 537
 538    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 539    '(' can be omitted.  We call this as "short-form" here after.
 540
 541    Now you may notice that there are a lot of ways for encoding the
 542    same multilingual text in ISO2022.  Actually, there exist many
 543    coding systems such as Compound Text (used in X's inter client
 544    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 545    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 546    localized platforms), and all of these are variants of ISO2022.
 547
 548    In addition to the above, Emacs handles two more kinds of escape
 549    sequences: ISO6429's direction specification and Emacs' private
 550    sequence for specifying character composition.
 551
 552    ISO6429's direction specification takes the following format:
 553         o CSI ']'      -- end of the current direction
 554         o CSI '0' ']'  -- end of the current direction
 555         o CSI '1' ']'  -- start of left-to-right text
 556         o CSI '2' ']'  -- start of right-to-left text
 557    The control character CSI (0x9B: control sequence introducer) is
 558    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 559
 560    Character composition specification takes the following format:
 561         o ESC '0' -- start character composition
 562         o ESC '1' -- end character composition
 563    Since these are not standard escape sequences of any ISO, the use
 564    of them for these meaning is restricted to Emacs only.  */
 565
 566 enum iso_code_class_type iso_code_class[256];
 567
 568 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 569    Check if a text is encoded in ISO2022.  If it is, returns an
 570    integer in which appropriate flag bits any of:
 571         CODING_CATEGORY_MASK_ISO_7
 572         CODING_CATEGORY_MASK_ISO_8_1
 573         CODING_CATEGORY_MASK_ISO_8_2
 574         CODING_CATEGORY_MASK_ISO_ELSE
 575    are set.  If a code which should never appear in ISO2022 is found,
 576    returns 0.  */
 577
 578 int
 579 detect_coding_iso2022 (src, src_end)
 580      unsigned char *src, *src_end;
 581 {
 582   unsigned char graphic_register[4];
 583   unsigned char c, esc_cntl;
 584   int mask = (CODING_CATEGORY_MASK_ISO_7
 585               | CODING_CATEGORY_MASK_ISO_8_1
 586               | CODING_CATEGORY_MASK_ISO_8_2);
 587   /* We may look ahead maximum 3 bytes.  */
 588   unsigned char *adjusted_src_end = src_end - 3;
 589   int i;
 590
 591   for (i = 0; i < 4; i++)
 592     graphic_register[i] = CHARSET_ASCII;
 593
 594   while (src < adjusted_src_end)
 595     {
 596       c = *src++;
 597       switch (c)
 598         {
 599         case ISO_CODE_ESC:
 600           if (src >= adjusted_src_end)
 601             break;
 602           c = *src++;
 603           if (c == '$')
 604             {
 605               /* Designation of 2-byte character set.  */
 606               if (src >= adjusted_src_end)
 607                 break;
 608               c = *src++;
 609             }
 610           if ((c >= ')' && c <= '+') || (c >= '-' && c <= '/'))
 611             /* Designation to graphic register 1, 2, or 3.  */
 612             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 613           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 614             return CODING_CATEGORY_MASK_ISO_ELSE;
 615           break;
 616
 617         case ISO_CODE_SI:
 618         case ISO_CODE_SO:
 619           return CODING_CATEGORY_MASK_ISO_ELSE;
 620
 621         case ISO_CODE_CSI:
 622         case ISO_CODE_SS2:
 623         case ISO_CODE_SS3:
 624           mask &= ~CODING_CATEGORY_MASK_ISO_7;
 625           break;
 626
 627         default:
 628           if (c < 0x80)
 629             break;
 630           else if (c < 0xA0)
 631             return 0;
 632           else
 633             {
 634               int count = 1;
 635
 636               mask &= ~CODING_CATEGORY_MASK_ISO_7;
 637               while (src < adjusted_src_end && *src >= 0xA0)
 638                 count++, src++;
 639               if (count & 1 && src < adjusted_src_end)
 640                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 641             }
 642           break;
 643         }
 644     }
 645
 646   return mask;
 647 }
 648
 649 /* Decode a character of which charset is CHARSET and the 1st position
 650    code is C1.  If dimension of CHARSET 2, the 2nd position code is
 651    fetched from SRC and set to C2.  If CHARSET is negative, it means
 652    that we are decoding ill formed text, and what we can do is just to
 653    read C1 as is.  */
 654
 655 #define DECODE_ISO_CHARACTER(charset, c1)                       \
 656   do {                                                          \
 657     if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2)     \
 658       ONE_MORE_BYTE (c2);                                       \
 659     if (COMPOSING_HEAD_P (coding->composing))                   \
 660       {                                                         \
 661         *dst++ = LEADING_CODE_COMPOSITION;                      \
 662         if (COMPOSING_WITH_RULE_P (coding->composing))          \
 663           /* To tell composition rules are embeded.  */         \
 664           *dst++ = 0xFF;                                        \
 665         coding->composing += 2;                                 \
 666       }                                                         \
 667     if ((charset) < 0)                                          \
 668       *dst++ = c1;                                              \
 669     else if ((charset) == CHARSET_ASCII)                        \
 670       DECODE_CHARACTER_ASCII (c1);                              \
 671     else if (CHARSET_DIMENSION (charset) == 1)                  \
 672       DECODE_CHARACTER_DIMENSION1 (charset, c1);                \
 673     else                                                        \
 674       DECODE_CHARACTER_DIMENSION2 (charset, c1, c2);            \
 675     if (COMPOSING_WITH_RULE_P (coding->composing))              \
 676       /* To tell a composition rule follows.  */                \
 677       coding->composing = COMPOSING_WITH_RULE_RULE;             \
 678   } while (0)
 679
 680 /* Set designation state into CODING.  */
 681 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 682   do {                                                                  \
 683     int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);     \
 684     Lisp_Object temp                                                    \
 685       = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table);     \
 686     if (! NILP (temp))                                                  \
 687       charset = get_charset_id (XCONS (temp)->cdr);                     \
 688     if (charset >= 0)                                                   \
 689       {                                                                 \
 690         if (coding->direction == 1                                      \
 691             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 692           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 693         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 694       }                                                                 \
 695   } while (0)
 696
 697 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 698
 699 int
 700 decode_coding_iso2022 (coding, source, destination,
 701                        src_bytes, dst_bytes, consumed)
 702      struct coding_system *coding;
 703      unsigned char *source, *destination;
 704      int src_bytes, dst_bytes;
 705      int *consumed;
 706 {
 707   unsigned char *src = source;
 708   unsigned char *src_end = source + src_bytes;
 709   unsigned char *dst = destination;
 710   unsigned char *dst_end = destination + dst_bytes;
 711   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 712      from DST_END to assure that overflow checking is necessary only
 713      at the head of loop.  */
 714   unsigned char *adjusted_dst_end = dst_end - 6;
 715   int charset;
 716   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 717   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 718   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 719
 720   while (src < src_end && dst < adjusted_dst_end)
 721     {
 722       /* SRC_BASE remembers the start position in source in each loop.
 723          The loop will be exited when there's not enough source text
 724          to analyze long escape sequence or 2-byte code (within macros
 725          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 726          to SRC_BASE before exiting.  */
 727       unsigned char *src_base = src;
 728       unsigned char c1 = *src++, c2, cmprule;
 729
 730       switch (iso_code_class [c1])
 731         {
 732         case ISO_0x20_or_0x7F:
 733           if (!coding->composing
 734               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 735             {
 736               /* This is SPACE or DEL.  */
 737               *dst++ = c1;
 738               break;
 739             }
 740           /* This is a graphic character, we fall down ...  */
 741
 742         case ISO_graphic_plane_0:
 743           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 744             {
 745               /* This is a composition rule.  */
 746               *dst++ = c1 | 0x80;
 747               coding->composing = COMPOSING_WITH_RULE_TAIL;
 748             }
 749           else
 750             DECODE_ISO_CHARACTER (charset0, c1);
 751           break;
 752
 753         case ISO_0xA0_or_0xFF:
 754           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 755             {
 756               /* Invalid code.  */
 757               *dst++ = c1;
 758               break;
 759             }
 760           /* This is a graphic character, we fall down ... */
 761
 762         case ISO_graphic_plane_1:
 763           DECODE_ISO_CHARACTER (charset1, c1);
 764           break;
 765
 766         case ISO_control_code:
 767           /* All ISO2022 control characters in this class have the
 768              same representation in Emacs internal format.  */
 769           *dst++ = c1;
 770           break;
 771
 772         case ISO_carriage_return:
 773           if (coding->eol_type == CODING_EOL_CR)
 774             {
 775               *dst++ = '\n';
 776             }
 777           else if (coding->eol_type == CODING_EOL_CRLF)
 778             {
 779               ONE_MORE_BYTE (c1);
 780               if (c1 == ISO_CODE_LF)
 781                 *dst++ = '\n';
 782               else
 783                 {
 784                   src--;
 785                   *dst++ = c1;
 786                 }
 787             }
 788           else
 789             {
 790               *dst++ = c1;
 791             }
 792           break;
 793
 794         case ISO_shift_out:
 795           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 796           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 797           break;
 798
 799         case ISO_shift_in:
 800           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 801           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 802           break;
 803
 804         case ISO_single_shift_2_7:
 805         case ISO_single_shift_2:
 806           /* SS2 is handled as an escape sequence of ESC 'N' */
 807           c1 = 'N';
 808           goto label_escape_sequence;
 809
 810         case ISO_single_shift_3:
 811           /* SS2 is handled as an escape sequence of ESC 'O' */
 812           c1 = 'O';
 813           goto label_escape_sequence;
 814
 815         case ISO_control_sequence_introducer:
 816           /* CSI is handled as an escape sequence of ESC '[' ...  */
 817           c1 = '[';
 818           goto label_escape_sequence;
 819
 820         case ISO_escape:
 821           ONE_MORE_BYTE (c1);
 822         label_escape_sequence:
 823           /* Escape sequences handled by Emacs are invocation,
 824              designation, direction specification, and character
 825              composition specification.  */
 826           switch (c1)
 827             {
 828             case '&':           /* revision of following character set */
 829               ONE_MORE_BYTE (c1);
 830               if (!(c1 >= '@' && c1 <= '~'))
 831                 {
 832                   goto label_invalid_escape_sequence;
 833                 }
 834               ONE_MORE_BYTE (c1);
 835               if (c1 != ISO_CODE_ESC)
 836                 {
 837                   goto label_invalid_escape_sequence;
 838                 }
 839               ONE_MORE_BYTE (c1);
 840               goto label_escape_sequence;
 841
 842             case '$':           /* designation of 2-byte character set */
 843               ONE_MORE_BYTE (c1);
 844               if (c1 >= '@' && c1 <= 'B')
 845                 {       /* designation of JISX0208.1978, GB2312.1980,
 846                                    or JISX0208.1980 */
 847                   DECODE_DESIGNATION (0, 2, 94, c1);
 848                 }
 849               else if (c1 >= 0x28 && c1 <= 0x2B)
 850                 {       /* designation of DIMENSION2_CHARS94 character set */
 851                   ONE_MORE_BYTE (c2);
 852                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 853                 }
 854               else if (c1 >= 0x2C && c1 <= 0x2F)
 855                 {       /* designation of DIMENSION2_CHARS96 character set */
 856                   ONE_MORE_BYTE (c2);
 857                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 858                 }
 859               else
 860                 {
 861                   goto label_invalid_escape_sequence;
 862                 }
 863               break;
 864
 865             case 'n':           /* invocation of locking-shift-2 */
 866               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 867               break;
 868
 869             case 'o':           /* invocation of locking-shift-3 */
 870               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 871               break;
 872
 873             case 'N':           /* invocation of single-shift-2 */
 874               ONE_MORE_BYTE (c1);
 875               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 876               DECODE_ISO_CHARACTER (charset, c1);
 877               break;
 878
 879             case 'O':           /* invocation of single-shift-3 */
 880               ONE_MORE_BYTE (c1);
 881               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 882               DECODE_ISO_CHARACTER (charset, c1);
 883               break;
 884
 885             case '0':           /* start composing without embeded rules */
 886               coding->composing = COMPOSING_NO_RULE_HEAD;
 887               break;
 888
 889             case '1':           /* end composing */
 890               coding->composing = COMPOSING_NO;
 891               break;
 892
 893             case '2':           /* start composing with embeded rules */
 894               coding->composing = COMPOSING_WITH_RULE_HEAD;
 895               break;
 896
 897             case '[':           /* specification of direction */
 898               /* For the moment, nested direction is not supported.
 899                  So, the value of `coding->direction' is 0 or 1: 0
 900                  means left-to-right, 1 means right-to-left.  */
 901               ONE_MORE_BYTE (c1);
 902               switch (c1)
 903                 {
 904                 case ']':       /* end of the current direction */
 905                   coding->direction = 0;
 906
 907                 case '0':       /* end of the current direction */
 908                 case '1':       /* start of left-to-right direction */
 909                   ONE_MORE_BYTE (c1);
 910                   if (c1 == ']')
 911                     coding->direction = 0;
 912                   else
 913                     goto label_invalid_escape_sequence;
 914                   break;
 915
 916                 case '2':       /* start of right-to-left direction */
 917                   ONE_MORE_BYTE (c1);
 918                   if (c1 == ']')
 919                     coding->direction= 1;
 920                   else
 921                     goto label_invalid_escape_sequence;
 922                   break;
 923
 924                 default:
 925                   goto label_invalid_escape_sequence;
 926                 }
 927               break;
 928
 929             default:
 930               if (c1 >= 0x28 && c1 <= 0x2B)
 931                 {       /* designation of DIMENSION1_CHARS94 character set */
 932                   ONE_MORE_BYTE (c2);
 933                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 934                 }
 935               else if (c1 >= 0x2C && c1 <= 0x2F)
 936                 {       /* designation of DIMENSION1_CHARS96 character set */
 937                   ONE_MORE_BYTE (c2);
 938                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 939                 }
 940               else
 941                 {
 942                   goto label_invalid_escape_sequence;
 943                 }
 944             }
 945           /* We must update these variables now.  */
 946           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 947           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 948           break;
 949
 950         label_invalid_escape_sequence:
 951           {
 952             int length = src - src_base;
 953
 954             bcopy (src_base, dst, length);
 955             dst += length;
 956           }
 957         }
 958       continue;
 959
 960     label_end_of_loop:
 961       coding->carryover_size = src - src_base;
 962       bcopy (src_base, coding->carryover, coding->carryover_size);
 963       src = src_base;
 964       break;
 965     }
 966
 967   /* If this is the last block of the text to be decoded, we had
 968      better just flush out all remaining codes in the text although
 969      they are not valid characters.  */
 970   if (coding->last_block)
 971     {
 972       bcopy (src, dst, src_end - src);
 973       dst += (src_end - src);
 974       src = src_end;
 975     }
 976   *consumed = src - source;
 977   return dst - destination;
 978 }
 979
 980 /* ISO2022 encoding staffs.  */
 981
 982 /*
 983    It is not enough to say just "ISO2022" on encoding, but we have to
 984    specify more details.  In Emacs, each coding-system of ISO2022
 985    variant has the following specifications:
 986         1. Initial designation to G0 thru G3.
 987         2. Allows short-form designation?
 988         3. ASCII should be designated to G0 before control characters?
 989         4. ASCII should be designated to G0 at end of line?
 990         5. 7-bit environment or 8-bit environment?
 991         6. Use locking-shift?
 992         7. Use Single-shift?
 993    And the following two are only for Japanese:
 994         8. Use ASCII in place of JIS0201-1976-Roman?
 995         9. Use JISX0208-1983 in place of JISX0208-1978?
 996    These specifications are encoded in `coding->flags' as flag bits
 997    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
 998    detail.
 999 */
1000
1001 /* Produce codes (escape sequence) for designating CHARSET to graphic
1002    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1003    the coding system CODING allows, produce designation sequence of
1004    short-form.  */
1005
1006 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1007   do {                                                                  \
1008     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1009     char *intermediate_char_94 = "()*+";                                \
1010     char *intermediate_char_96 = ",-./";                                \
1011     Lisp_Object temp                                                    \
1012       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1013     if (! NILP (temp))                                                  \
1014         {                                                               \
1015         *dst++ = ISO_CODE_ESC;                                          \
1016         *dst++ = '&';                                                   \
1017         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1018       }                                                                 \
1019     *dst++ = ISO_CODE_ESC;                                              \
1020     if (CHARSET_DIMENSION (charset) == 1)                               \
1021       {                                                                 \
1022         if (CHARSET_CHARS (charset) == 94)                              \
1023           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1024         else                                                            \
1025           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1026       }                                                                 \
1027     else                                                                \
1028       {                                                                 \
1029         *dst++ = '$';                                                   \
1030         if (CHARSET_CHARS (charset) == 94)                              \
1031           {                                                             \
1032             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1033                 || reg != 0                                             \
1034                 || final_char < '@' || final_char > 'B')                \
1035               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1036           }                                                             \
1037         else                                                            \
1038           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1039       }                                                                 \
1040     *dst++ = final_char;                                                \
1041     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1042   } while (0)
1043
1044 /* The following two macros produce codes (control character or escape
1045    sequence) for ISO2022 single-shift functions (single-shift-2 and
1046    single-shift-3).  */
1047
1048 #define ENCODE_SINGLE_SHIFT_2                           \
1049   do {                                                  \
1050     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1051       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1052     else                                                \
1053       *dst++ = ISO_CODE_SS2;                            \
1054     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1055   } while (0)
1056
1057 #define ENCODE_SINGLE_SHIFT_3                           \
1058   do {                                                  \
1059     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1060       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1061     else                                                \
1062       *dst++ = ISO_CODE_SS3;                            \
1063     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1064   } while (0)
1065
1066 /* The following four macros produce codes (control character or
1067    escape sequence) for ISO2022 locking-shift functions (shift-in,
1068    shift-out, locking-shift-2, and locking-shift-3).  */
1069
1070 #define ENCODE_SHIFT_IN                         \
1071   do {                                          \
1072     *dst++ = ISO_CODE_SI;                       \
1073     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1074   } while (0)
1075
1076 #define ENCODE_SHIFT_OUT                        \
1077   do {                                          \
1078     *dst++ = ISO_CODE_SO;                       \
1079     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1080   } while (0)
1081
1082 #define ENCODE_LOCKING_SHIFT_2                  \
1083   do {                                          \
1084     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1085     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1086   } while (0)
1087
1088 #define ENCODE_LOCKING_SHIFT_3                  \
1089   do {                                          \
1090     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1091     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1092   } while (0)
1093
1094 /* Produce codes for a DIMENSION1 character of which character set is
1095    CHARSET and position-code is C1.  Designation and invocation
1096    sequences are also produced in advance if necessary.  */
1097
1098
1099 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1100   do {                                                                  \
1101     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1102       {                                                                 \
1103         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1104           *dst++ = c1 & 0x7F;                                           \
1105         else                                                            \
1106           *dst++ = c1 | 0x80;                                           \
1107         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1108         break;                                                          \
1109       }                                                                 \
1110     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1111       {                                                                 \
1112         *dst++ = c1 & 0x7F;                                             \
1113         break;                                                          \
1114       }                                                                 \
1115     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1116       {                                                                 \
1117         *dst++ = c1 | 0x80;                                             \
1118         break;                                                          \
1119       }                                                                 \
1120     else                                                                \
1121       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1122          must invoke it, or, at first, designate it to some graphic     \
1123          register.  Then repeat the loop to actually produce the        \
1124          character.  */                                                 \
1125       dst = encode_invocation_designation (charset, coding, dst);       \
1126   } while (1)
1127
1128 /* Produce codes for a DIMENSION2 character of which character set is
1129    CHARSET and position-codes are C1 and C2.  Designation and
1130    invocation codes are also produced in advance if necessary.  */
1131
1132 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1133   do {                                                                  \
1134     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1135       {                                                                 \
1136         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1137           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1138         else                                                            \
1139           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1140         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1141         break;                                                          \
1142       }                                                                 \
1143     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1144       {                                                                 \
1145         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1146         break;                                                          \
1147       }                                                                 \
1148     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1149       {                                                                 \
1150         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1151         break;                                                          \
1152       }                                                                 \
1153     else                                                                \
1154       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1155          must invoke it, or, at first, designate it to some graphic     \
1156          register.  Then repeat the loop to actually produce the        \
1157          character.  */                                                 \
1158       dst = encode_invocation_designation (charset, coding, dst);       \
1159   } while (1)
1160
1161 /* Produce designation and invocation codes at a place pointed by DST
1162    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1163    Return new DST.  */
1164
1165 unsigned char *
1166 encode_invocation_designation (charset, coding, dst)
1167      int charset;
1168      struct coding_system *coding;
1169      unsigned char *dst;
1170 {
1171   int reg;                      /* graphic register number */
1172
1173   /* At first, check designations.  */
1174   for (reg = 0; reg < 4; reg++)
1175     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1176       break;
1177
1178   if (reg >= 4)
1179     {
1180       /* CHARSET is not yet designated to any graphic registers.  */
1181       /* At first check the requested designation.  */
1182       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1183       if (reg < 0)
1184         /* Since CHARSET requests no special designation, designate to
1185            graphic register 0.  */
1186         reg = 0;
1187
1188       ENCODE_DESIGNATION (charset, reg, coding);
1189     }
1190
1191   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1192       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1193     {
1194       /* Since the graphic register REG is not invoked to any graphic
1195          planes, invoke it to graphic plane 0.  */
1196       switch (reg)
1197         {
1198         case 0:                 /* graphic register 0 */
1199           ENCODE_SHIFT_IN;
1200           break;
1201
1202         case 1:                 /* graphic register 1 */
1203           ENCODE_SHIFT_OUT;
1204           break;
1205
1206         case 2:                 /* graphic register 2 */
1207           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1208             ENCODE_SINGLE_SHIFT_2;
1209           else
1210             ENCODE_LOCKING_SHIFT_2;
1211           break;
1212
1213         case 3:                 /* graphic register 3 */
1214           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1215             ENCODE_SINGLE_SHIFT_3;
1216           else
1217             ENCODE_LOCKING_SHIFT_3;
1218           break;
1219         }
1220     }
1221   return dst;
1222 }
1223
1224 /* The following two macros produce codes for indicating composition.  */
1225 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1226 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1227 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1228
1229 /* The following three macros produce codes for indicating direction
1230    of text.  */
1231 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1232   do {                                                  \
1233     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1234       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1235     else                                                \
1236       *dst++ = ISO_CODE_CSI;                            \
1237   } while (0)
1238
1239 #define ENCODE_DIRECTION_R2L    \
1240   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1241
1242 #define ENCODE_DIRECTION_L2R    \
1243   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1244
1245 /* Produce codes for designation and invocation to reset the graphic
1246    planes and registers to initial state.  */
1247 #define ENCODE_RESET_PLANE_AND_REGISTER(eol)                                  \
1248   do {                                                                        \
1249     int reg;                                                                  \
1250     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                          \
1251       ENCODE_SHIFT_IN;                                                        \
1252     for (reg = 0; reg < 4; reg++)                                             \
1253       {                                                                       \
1254         if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)            \
1255           {                                                                   \
1256             if (eol) CODING_SPEC_ISO_DESIGNATION (coding, reg) = -1;          \
1257           }                                                                   \
1258         else if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                    \
1259                  != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))        \
1260           ENCODE_DESIGNATION                                                  \
1261             (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1262       }                                                                       \
1263   } while (0)
1264
1265 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1266
1267 int
1268 encode_coding_iso2022 (coding, source, destination,
1269                        src_bytes, dst_bytes, consumed)
1270      struct coding_system *coding;
1271      unsigned char *source, *destination;
1272      int src_bytes, dst_bytes;
1273      int *consumed;
1274 {
1275   unsigned char *src = source;
1276   unsigned char *src_end = source + src_bytes;
1277   unsigned char *dst = destination;
1278   unsigned char *dst_end = destination + dst_bytes;
1279   /* Since the maximum bytes produced by each loop is 6, we subtract 5
1280      from DST_END to assure overflow checking is necessary only at the
1281      head of loop.  */
1282   unsigned char *adjusted_dst_end = dst_end - 5;
1283
1284   while (src < src_end && dst < adjusted_dst_end)
1285     {
1286       /* SRC_BASE remembers the start position in source in each loop.
1287          The loop will be exited when there's not enough source text
1288          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1289          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1290          reset to SRC_BASE before exiting.  */
1291       unsigned char *src_base = src;
1292       unsigned char c1 = *src++, c2, c3, c4;
1293       int charset;
1294
1295       /* If we are seeing a component of a composite character, we are
1296          seeing a leading-code specially encoded for composition, or a
1297          composition rule if composing with rule.  We must set C1
1298          to a normal leading-code or an ASCII code.  If we are not at
1299          a composed character, we must reset the composition state.  */
1300       if (COMPOSING_P (coding->composing))
1301         {
1302           if (c1 < 0xA0)
1303             {
1304               /* We are not in a composite character any longer.  */
1305               coding->composing = COMPOSING_NO;
1306               ENCODE_COMPOSITION_END;
1307             }
1308           else
1309             {
1310               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1311                 {
1312                   *dst++ = c1 & 0x7F;
1313                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1314                   continue;
1315                 }
1316               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1317                 coding->composing = COMPOSING_WITH_RULE_RULE;
1318               if (c1 == 0xA0)
1319                 {
1320                   /* This is an ASCII component.  */
1321                   ONE_MORE_BYTE (c1);
1322                   c1 &= 0x7F;
1323                 }
1324               else
1325                 /* This is a leading-code of non ASCII component.  */
1326                 c1 -= 0x20;
1327             }
1328         }
1329
1330       /* Now encode one character.  C1 is a control character, an
1331          ASCII character, or a leading-code of multi-byte character.  */
1332       switch (emacs_code_class[c1])
1333         {
1334         case EMACS_ascii_code:
1335           ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
1336           break;
1337
1338         case EMACS_control_code:
1339           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1340             ENCODE_RESET_PLANE_AND_REGISTER (0);
1341           *dst++ = c1;
1342           break;
1343
1344         case EMACS_carriage_return_code:
1345           if (!coding->selective)
1346             {
1347               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1348                 ENCODE_RESET_PLANE_AND_REGISTER (0);
1349               *dst++ = c1;
1350               break;
1351             }
1352           /* fall down to treat '\r' as '\n' ...  */
1353
1354         case EMACS_linefeed_code:
1355           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1356             ENCODE_RESET_PLANE_AND_REGISTER (1);
1357           if (coding->eol_type == CODING_EOL_LF
1358               || coding->eol_type == CODING_EOL_AUTOMATIC)
1359             *dst++ = ISO_CODE_LF;
1360           else if (coding->eol_type == CODING_EOL_CRLF)
1361             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1362           else
1363             *dst++ = ISO_CODE_CR;
1364           break;
1365
1366         case EMACS_leading_code_2:
1367           ONE_MORE_BYTE (c2);
1368           ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
1369           break;
1370
1371         case EMACS_leading_code_3:
1372           TWO_MORE_BYTES (c2, c3);
1373           if (c1 < LEADING_CODE_PRIVATE_11)
1374             ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
1375           else
1376             ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
1377           break;
1378
1379         case EMACS_leading_code_4:
1380           THREE_MORE_BYTES (c2, c3, c4);
1381           ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
1382           break;
1383
1384         case EMACS_leading_code_composition:
1385           ONE_MORE_BYTE (c1);
1386           if (c1 == 0xFF)
1387             {
1388               coding->composing = COMPOSING_WITH_RULE_HEAD;
1389               ENCODE_COMPOSITION_WITH_RULE_START;
1390             }
1391           else
1392             {
1393               /* Rewind one byte because it is a character code of
1394                  composition elements.  */
1395               src--;
1396               coding->composing = COMPOSING_NO_RULE_HEAD;
1397               ENCODE_COMPOSITION_NO_RULE_START;
1398             }
1399           break;
1400
1401         case EMACS_invalid_code:
1402           *dst++ = c1;
1403           break;
1404         }
1405       continue;
1406     label_end_of_loop:
1407       coding->carryover_size = src - src_base;
1408       bcopy (src_base, coding->carryover, coding->carryover_size);
1409       src = src_base;
1410       break;
1411     }
1412
1413   /* If this is the last block of the text to be encoded, we must
1414      reset the state of graphic planes and registers to initial one.
1415      In addition, we had better just flush out all remaining codes in
1416      the text although they are not valid characters.  */
1417   if (coding->last_block)
1418     {
1419       ENCODE_RESET_PLANE_AND_REGISTER (1);
1420       bcopy(src, dst, src_end - src);
1421       dst += (src_end - src);
1422       src = src_end;
1423     }
1424   *consumed = src - source;
1425   return dst - destination;
1426 }
1427
1428 \f
1429 /*** 4. SJIS and BIG5 handlers ***/
1430
1431 /* Although SJIS and BIG5 are not ISO's coding system, They are used
1432    quite widely.  So, for the moment, Emacs supports them in the bare
1433    C code.  But, in the future, they may be supported only by CCL.  */
1434
1435 /* SJIS is a coding system encoding three character sets: ASCII, right
1436    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1437    as is.  A character of charset katakana-jisx0201 is encoded by
1438    "position-code + 0x80".  A character of charset japanese-jisx0208
1439    is encoded in 2-byte but two position-codes are divided and shifted
1440    so that it fit in the range below.
1441
1442    --- CODE RANGE of SJIS ---
1443    (character set)      (range)
1444    ASCII                0x00 .. 0x7F
1445    KATAKANA-JISX0201    0xA0 .. 0xDF
1446    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1447             (2nd byte)  0x40 .. 0xFF
1448    -------------------------------
1449
1450 */
1451
1452 /* BIG5 is a coding system encoding two character sets: ASCII and
1453    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1454    character set and is encoded in two-byte.
1455
1456    --- CODE RANGE of BIG5 ---
1457    (character set)      (range)
1458    ASCII                0x00 .. 0x7F
1459    Big5 (1st byte)      0xA1 .. 0xFE
1460         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1461    --------------------------
1462
1463    Since the number of characters in Big5 is larger than maximum
1464    characters in Emacs' charset (96x96), it can't be handled as one
1465    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1466    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1467    contains frequently used characters and the latter contains less
1468    frequently used characters.  */
1469
1470 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1471    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1472    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1473    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1474
1475 /* Number of Big5 characters which have the same code in 1st byte.  */
1476 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1477
1478 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1479   do {                                                                  \
1480     unsigned int temp                                                   \
1481       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1482     if (b1 < 0xC9)                                                      \
1483       charset = charset_big5_1;                                         \
1484     else                                                                \
1485       {                                                                 \
1486         charset = charset_big5_2;                                       \
1487         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1488       }                                                                 \
1489     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1490     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1491   } while (0)
1492
1493 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1494   do {                                                                  \
1495     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1496     if (charset == charset_big5_2)                                      \
1497       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1498     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1499     b2 = temp % BIG5_SAME_ROW;                                          \
1500     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1501   } while (0)
1502
1503 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1504    Check if a text is encoded in SJIS.  If it is, return
1505    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1506
1507 int
1508 detect_coding_sjis (src, src_end)
1509      unsigned char *src, *src_end;
1510 {
1511   unsigned char c;
1512
1513   while (src < src_end)
1514     {
1515       c = *src++;
1516       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1517         return 0;
1518       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1519         {
1520           if (src < src_end && *src++ < 0x40)
1521             return 0;
1522         }
1523     }
1524   return CODING_CATEGORY_MASK_SJIS;
1525 }
1526
1527 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1528    Check if a text is encoded in BIG5.  If it is, return
1529    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1530
1531 int
1532 detect_coding_big5 (src, src_end)
1533      unsigned char *src, *src_end;
1534 {
1535   unsigned char c;
1536
1537   while (src < src_end)
1538     {
1539       c = *src++;
1540       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1541         return 0;
1542       if (c >= 0xA1)
1543         {
1544           if (src >= src_end)
1545             break;
1546           c = *src++;
1547           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1548             return 0;
1549         }
1550     }
1551   return CODING_CATEGORY_MASK_BIG5;
1552 }
1553
1554 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1555    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1556
1557 int
1558 decode_coding_sjis_big5 (coding, source, destination,
1559                          src_bytes, dst_bytes, consumed, sjis_p)
1560      struct coding_system *coding;
1561      unsigned char *source, *destination;
1562      int src_bytes, dst_bytes;
1563      int *consumed;
1564      int sjis_p;
1565 {
1566   unsigned char *src = source;
1567   unsigned char *src_end = source + src_bytes;
1568   unsigned char *dst = destination;
1569   unsigned char *dst_end = destination + dst_bytes;
1570   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1571      from DST_END to assure overflow checking is necessary only at the
1572      head of loop.  */
1573   unsigned char *adjusted_dst_end = dst_end - 3;
1574
1575   while (src < src_end && dst < adjusted_dst_end)
1576     {
1577       /* SRC_BASE remembers the start position in source in each loop.
1578          The loop will be exited when there's not enough source text
1579          to analyze two-byte character (within macro ONE_MORE_BYTE).
1580          In that case, SRC is reset to SRC_BASE before exiting.  */
1581       unsigned char *src_base = src;
1582       unsigned char c1 = *src++, c2, c3, c4;
1583
1584       if (c1 == '\r')
1585         {
1586           if (coding->eol_type == CODING_EOL_CRLF)
1587             {
1588               ONE_MORE_BYTE (c2);
1589               if (c2 == '\n')
1590                 *dst++ = c2;
1591               else
1592                 /* To process C2 again, SRC is subtracted by 1.  */
1593                 *dst++ = c1, src--;
1594             }
1595           else
1596             *dst++ = c1;
1597         }
1598       else if (c1 < 0x80)
1599         *dst++ = c1;
1600       else if (c1 < 0xA0 || c1 >= 0xE0)
1601         {
1602           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1603           if (sjis_p)
1604             {
1605               ONE_MORE_BYTE (c2);
1606               DECODE_SJIS (c1, c2, c3, c4);
1607               DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1608             }
1609           else if (c1 >= 0xE0 && c1 < 0xFF)
1610             {
1611               int charset;
1612
1613               ONE_MORE_BYTE (c2);
1614               DECODE_BIG5 (c1, c2, charset, c3, c4);
1615               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1616             }
1617           else                  /* Invalid code */
1618             *dst++ = c1;
1619         }
1620       else
1621         {
1622           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1623           if (sjis_p)
1624             DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1625           else
1626             {
1627               int charset;
1628
1629               ONE_MORE_BYTE (c2);
1630               DECODE_BIG5 (c1, c2, charset, c3, c4);
1631               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1632             }
1633         }
1634       continue;
1635
1636     label_end_of_loop:
1637       coding->carryover_size = src - src_base;
1638       bcopy (src_base, coding->carryover, coding->carryover_size);
1639       src = src_base;
1640       break;
1641     }
1642
1643   *consumed = src - source;
1644   return dst - destination;
1645 }
1646
1647 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1648    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1649    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1650    sure that all these charsets are registered as official charset
1651    (i.e. do not have extended leading-codes).  Characters of other
1652    charsets are produced without any encoding.  If SJIS_P is 1, encode
1653    SJIS text, else encode BIG5 text.  */
1654
1655 int
1656 encode_coding_sjis_big5 (coding, source, destination,
1657                          src_bytes, dst_bytes, consumed, sjis_p)
1658      struct coding_system *coding;
1659      unsigned char *source, *destination;
1660      int src_bytes, dst_bytes;
1661      int *consumed;
1662      int sjis_p;
1663 {
1664   unsigned char *src = source;
1665   unsigned char *src_end = source + src_bytes;
1666   unsigned char *dst = destination;
1667   unsigned char *dst_end = destination + dst_bytes;
1668   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1669      from DST_END to assure overflow checking is necessary only at the
1670      head of loop.  */
1671   unsigned char *adjusted_dst_end = dst_end - 1;
1672
1673   while (src < src_end && dst < adjusted_dst_end)
1674     {
1675       /* SRC_BASE remembers the start position in source in each loop.
1676          The loop will be exited when there's not enough source text
1677          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1678          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1679          before exiting.  */
1680       unsigned char *src_base = src;
1681       unsigned char c1 = *src++, c2, c3, c4;
1682
1683       if (coding->composing)
1684         {
1685           if (c1 == 0xA0)
1686             {
1687               ONE_MORE_BYTE (c1);
1688               c1 &= 0x7F;
1689             }
1690           else if (c1 >= 0xA0)
1691             c1 -= 0x20;
1692           else
1693             coding->composing = 0;
1694         }
1695
1696       switch (emacs_code_class[c1])
1697         {
1698         case EMACS_ascii_code:
1699         case EMACS_control_code:
1700           *dst++ = c1;
1701           break;
1702
1703         case EMACS_carriage_return_code:
1704           if (!coding->selective)
1705             {
1706               *dst++ = c1;
1707               break;
1708             }
1709           /* fall down to treat '\r' as '\n' ...  */
1710
1711         case EMACS_linefeed_code:
1712           if (coding->eol_type == CODING_EOL_LF
1713               || coding->eol_type == CODING_EOL_AUTOMATIC)
1714             *dst++ = '\n';
1715           else if (coding->eol_type == CODING_EOL_CRLF)
1716             *dst++ = '\r', *dst++ = '\n';
1717           else
1718             *dst++ = '\r';
1719           break;
1720
1721         case EMACS_leading_code_2:
1722           ONE_MORE_BYTE (c2);
1723           if (sjis_p && c1 == charset_katakana_jisx0201)
1724             *dst++ = c2;
1725           else
1726             *dst++ = c1, *dst++ = c2;
1727           break;
1728
1729         case EMACS_leading_code_3:
1730           TWO_MORE_BYTES (c2, c3);
1731           c2 &= 0x7F, c3 &= 0x7F;
1732           if (sjis_p && c1 == charset_jisx0208)
1733             {
1734               unsigned char s1, s2;
1735
1736               ENCODE_SJIS (c2, c3, s1, s2);
1737               *dst++ = s1, *dst++ = s2;
1738             }
1739           else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1740             {
1741               unsigned char b1, b2;
1742
1743               ENCODE_BIG5 (c1, c2, c3, b1, b2);
1744               *dst++ = b1, *dst++ = b2;
1745             }
1746           else
1747             *dst++ = c1, *dst++ = c2, *dst++ = c3;
1748           break;
1749
1750         case EMACS_leading_code_4:
1751           THREE_MORE_BYTES (c2, c3, c4);
1752           *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1753           break;
1754
1755         case EMACS_leading_code_composition:
1756           coding->composing = 1;
1757           break;
1758
1759         default:                /* i.e. case EMACS_invalid_code: */
1760           *dst++ = c1;
1761         }
1762       continue;
1763
1764     label_end_of_loop:
1765       coding->carryover_size = src - src_base;
1766       bcopy (src_base, coding->carryover, coding->carryover_size);
1767       src = src_base;
1768       break;
1769     }
1770
1771   *consumed = src - source;
1772   return dst - destination;
1773 }
1774
1775 \f
1776 /*** 5. End-of-line handlers ***/
1777
1778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1779    This function is called only when `coding->eol_type' is
1780    CODING_EOL_CRLF or CODING_EOL_CR.  */
1781
1782 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1783      struct coding_system *coding;
1784      unsigned char *source, *destination;
1785      int src_bytes, dst_bytes;
1786      int *consumed;
1787 {
1788   unsigned char *src = source;
1789   unsigned char *src_end = source + src_bytes;
1790   unsigned char *dst = destination;
1791   unsigned char *dst_end = destination + dst_bytes;
1792   int produced;
1793
1794   switch (coding->eol_type)
1795     {
1796     case CODING_EOL_CRLF:
1797       {
1798         /* Since the maximum bytes produced by each loop is 2, we
1799            subtract 1 from DST_END to assure overflow checking is
1800            necessary only at the head of loop.  */
1801         unsigned char *adjusted_dst_end = dst_end - 1;
1802
1803         while (src < src_end && dst < adjusted_dst_end)
1804           {
1805             unsigned char *src_base = src;
1806             unsigned char c = *src++;
1807             if (c == '\r')
1808               {
1809                 ONE_MORE_BYTE (c);
1810                 if (c != '\n')
1811                   *dst++ = '\r';
1812
1813               }
1814             else
1815               *dst++ = c;
1816             continue;
1817
1818           label_end_of_loop:
1819             coding->carryover_size = src - src_base;
1820             bcopy (src_base, coding->carryover, coding->carryover_size);
1821             src = src_base;
1822             break;
1823           }
1824         *consumed = src - source;
1825         produced = dst - destination;
1826         break;
1827       }
1828
1829     case CODING_EOL_CR:
1830       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1831       bcopy (source, destination, produced);
1832       dst_end = destination + produced;
1833       while (dst < dst_end)
1834         if (*dst++ == '\r') dst[-1] = '\n';
1835       *consumed = produced;
1836       break;
1837
1838     default:                    /* i.e. case: CODING_EOL_LF */
1839       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1840       bcopy (source, destination, produced);
1841       *consumed = produced;
1842       break;
1843     }
1844
1845   return produced;
1846 }
1847
1848 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
1849    format of end-of-line according to `coding->eol_type'.  If
1850    `coding->selective' is 1, code '\r' in source text also means
1851    end-of-line.  */
1852
1853 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1854      struct coding_system *coding;
1855      unsigned char *source, *destination;
1856      int src_bytes, dst_bytes;
1857      int *consumed;
1858 {
1859   unsigned char *src = source;
1860   unsigned char *dst = destination;
1861   int produced;
1862
1863   if (src_bytes <= 0)
1864     return 0;
1865
1866   switch (coding->eol_type)
1867     {
1868     case CODING_EOL_LF:
1869     case CODING_EOL_AUTOMATIC:
1870       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1871       bcopy (source, destination, produced);
1872       if (coding->selective)
1873         {
1874           int i = produced;
1875           while (i--)
1876             if (*dst++ == '\r') dst[-1] = '\n';
1877         }
1878       *consumed = produced;
1879
1880     case CODING_EOL_CRLF:
1881       {
1882         unsigned char c;
1883         unsigned char *src_end = source + src_bytes;
1884         unsigned char *dst_end = destination + dst_bytes;
1885         /* Since the maximum bytes produced by each loop is 2, we
1886            subtract 1 from DST_END to assure overflow checking is
1887            necessary only at the head of loop.  */
1888         unsigned char *adjusted_dst_end = dst_end - 1;
1889
1890         while (src < src_end && dst < adjusted_dst_end)
1891           {
1892             c = *src++;
1893             if (c == '\n' || (c == '\r' && coding->selective))
1894               *dst++ = '\r', *dst++ = '\n';
1895             else
1896               *dst++ = c;
1897           }
1898         produced = dst - destination;
1899         *consumed = src - source;
1900         break;
1901       }
1902
1903     default:                    /* i.e. case CODING_EOL_CR: */
1904       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1905       bcopy (source, destination, produced);
1906       {
1907         int i = produced;
1908         while (i--)
1909           if (*dst++ == '\n') dst[-1] = '\r';
1910       }
1911       *consumed = produced;
1912     }
1913
1914   return produced;
1915 }
1916
1917 \f
1918 /*** 6. C library functions ***/
1919
1920 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
1921    has a property `coding-system'.  The value of this property is a
1922    vector of length 5 (called as coding-vector).  Among elements of
1923    this vector, the first (element[0]) and the fifth (element[4])
1924    carry important information for decoding/encoding.  Before
1925    decoding/encoding, this information should be set in fields of a
1926    structure of type `coding_system'.
1927
1928    A value of property `coding-system' can be a symbol of another
1929    subsidiary coding-system.  In that case, Emacs gets coding-vector
1930    from that symbol.
1931
1932    `element[0]' contains information to be set in `coding->type'.  The
1933    value and its meaning is as follows:
1934
1935    0 -- coding_system_internal
1936    1 -- coding_system_sjis
1937    2 -- coding_system_iso2022
1938    3 -- coding_system_big5
1939    4 -- coding_system_ccl
1940    nil -- coding_system_no_conversion
1941    t -- coding_system_automatic
1942
1943    `element[4]' contains information to be set in `coding->flags' and
1944    `coding->spec'.  The meaning varies by `coding->type'.
1945
1946    If `coding->type' is `coding_type_iso2022', element[4] is a vector
1947    of length 32 (of which the first 13 sub-elements are used now).
1948    Meanings of these sub-elements are:
1949
1950    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
1951         If the value is an integer of valid charset, the charset is
1952         assumed to be designated to graphic register N initially.
1953
1954         If the value is minus, it is a minus value of charset which
1955         reserves graphic register N, which means that the charset is
1956         not designated initially but should be designated to graphic
1957         register N just before encoding a character in that charset.
1958
1959         If the value is nil, graphic register N is never used on
1960         encoding.
1961
1962    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
1963         Each value takes t or nil.  See the section ISO2022 of
1964         `coding.h' for more information.
1965
1966    If `coding->type' is `coding_type_big5', element[4] is t to denote
1967    BIG5-ETen or nil to denote BIG5-HKU.
1968
1969    If `coding->type' takes the other value, element[4] is ignored.
1970
1971    Emacs Lisp's coding system also carries information about format of
1972    end-of-line in a value of property `eol-type'.  If the value is
1973    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
1974    means CODING_EOL_CR.  If it is not integer, it should be a vector
1975    of subsidiary coding systems of which property `eol-type' has one
1976    of above values.
1977
1978 */
1979
1980 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
1981    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
1982    is setup so that no conversion is necessary and return -1, else
1983    return 0.  */
1984
1985 int
1986 setup_coding_system (coding_system_symbol, coding)
1987      Lisp_Object coding_system_symbol;
1988      struct coding_system *coding;
1989 {
1990   Lisp_Object coding_system_vector = Qnil;
1991   Lisp_Object type, eol_type;
1992
1993   /* At first, set several fields default values.  */
1994   coding->require_flushing = 0;
1995   coding->last_block = 0;
1996   coding->selective = 0;
1997   coding->composing = 0;
1998   coding->direction = 0;
1999   coding->carryover_size = 0;
2000   coding->symbol = Qnil;
2001   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2002
2003   /* Get value of property `coding-system'.  If it is a Lisp symbol
2004      pointing another coding system, fetch its property until we get a
2005      vector.  */
2006   while (!NILP (coding_system_symbol))
2007     {
2008       coding->symbol = coding_system_symbol;
2009       if (NILP (coding->post_read_conversion))
2010         coding->post_read_conversion = Fget (coding_system_symbol,
2011                                              Qpost_read_conversion);
2012       if (NILP (coding->pre_write_conversion))
2013         coding->pre_write_conversion = Fget (coding_system_symbol,
2014                                              Qpre_write_conversion);
2015
2016       coding_system_vector = Fget (coding_system_symbol, Qcoding_system);
2017       if (VECTORP (coding_system_vector))
2018         break;
2019       coding_system_symbol = coding_system_vector;
2020     }
2021   Vlast_coding_system_used = coding->symbol;
2022
2023   if (!VECTORP (coding_system_vector)
2024       || XVECTOR (coding_system_vector)->size != 5)
2025     goto label_invalid_coding_system;
2026
2027   /* Get value of property `eol-type' by searching from the root
2028      coding-system.  */
2029   coding_system_symbol = coding->symbol;
2030   eol_type = Qnil;
2031   while (SYMBOLP (coding_system_symbol) && !NILP (coding_system_symbol))
2032     {
2033       eol_type = Fget (coding_system_symbol, Qeol_type);
2034       if (!NILP (eol_type))
2035         break;
2036       coding_system_symbol = Fget (coding_system_symbol, Qcoding_system);
2037     }
2038
2039   if (VECTORP (eol_type))
2040     coding->eol_type = CODING_EOL_AUTOMATIC;
2041   else if (XFASTINT (eol_type) == 1)
2042     coding->eol_type = CODING_EOL_CRLF;
2043   else if (XFASTINT (eol_type) == 2)
2044     coding->eol_type = CODING_EOL_CR;
2045   else
2046     coding->eol_type = CODING_EOL_LF;
2047
2048   type = XVECTOR (coding_system_vector)->contents[0];
2049   switch (XFASTINT (type))
2050     {
2051     case 0:
2052       coding->type = coding_type_internal;
2053       break;
2054
2055     case 1:
2056       coding->type = coding_type_sjis;
2057       break;
2058
2059     case 2:
2060       coding->type = coding_type_iso2022;
2061       {
2062         Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
2063         Lisp_Object *flags;
2064         int i, charset, default_reg_bits = 0;
2065
2066         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2067           goto label_invalid_coding_system;
2068
2069         flags = XVECTOR (val)->contents;
2070         coding->flags
2071           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2072              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2073              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2074              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2075              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2076              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2077              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2078              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2079              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION));
2080
2081         /* Invoke graphic register 0 to plane 0.  */
2082         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2083         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2084         CODING_SPEC_ISO_INVOCATION (coding, 1)
2085           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2086         /* Not single shifting at first.  */
2087         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2088
2089         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2090            FLAGS[REG] can be one of below:
2091                 integer CHARSET: CHARSET occupies register I,
2092                 t: designate nothing to REG initially, but can be used
2093                   by any charsets,
2094                 list of integer, nil, or t: designate the first
2095                   element (if integer) to REG initially, the remaining
2096                   elements (if integer) is designated to REG on request,
2097                   if an element is t, REG can be used by any charset,
2098                 nil: REG is never used.  */
2099         for (charset = 0; charset < MAX_CHARSET; charset++)
2100           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2101         for (i = 0; i < 4; i++)
2102           {
2103             if (INTEGERP (flags[i])
2104                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
2105               {
2106                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2107                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2108               }
2109             else if (EQ (flags[i], Qt))
2110               {
2111                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2112                 default_reg_bits |= 1 << i;
2113               }
2114             else if (CONSP (flags[i]))
2115               {
2116                 Lisp_Object tail = flags[i];
2117
2118                 if (INTEGERP (XCONS (tail)->car)
2119                     && (charset = XINT (XCONS (tail)->car),
2120                         CHARSET_VALID_P (charset)))
2121                   {
2122                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2123                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2124                   }
2125                 else
2126                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2127                 tail = XCONS (tail)->cdr;
2128                 while (CONSP (tail))
2129                   {
2130                     if (INTEGERP (XCONS (tail)->car)
2131                         && (charset = XINT (XCONS (tail)->car),
2132                             CHARSET_VALID_P (charset)))
2133                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2134                         = i;
2135                     else if (EQ (XCONS (tail)->car, Qt))
2136                       default_reg_bits |= 1 << i;
2137                     tail = XCONS (tail)->cdr;
2138                   }
2139               }
2140             else
2141               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2142
2143             CODING_SPEC_ISO_DESIGNATION (coding, i)
2144               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2145           }
2146
2147         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2148           {
2149             /* REG 1 can be used only by locking shift in 7-bit env.  */
2150             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2151               default_reg_bits &= ~2;
2152             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2153               /* Without any shifting, only REG 0 and 1 can be used.  */
2154               default_reg_bits &= 3;
2155           }
2156
2157         for (charset = 0; charset < MAX_CHARSET; charset++)
2158           if (CHARSET_VALID_P (charset)
2159               && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2160             {
2161               /* We have not yet decided where to designate CHARSET.  */
2162               int reg_bits = default_reg_bits;
2163
2164               if (CHARSET_CHARS (charset) == 96)
2165                 /* A charset of CHARS96 can't be designated to REG 0.  */
2166                 reg_bits &= ~1;
2167
2168               if (reg_bits)
2169                 /* There exist some default graphic register.  */
2170                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2171                   = (reg_bits & 1
2172                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2173               else
2174                 /* We anyway have to designate CHARSET to somewhere.  */
2175                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2176                   = (CHARSET_CHARS (charset) == 94
2177                      ? 0
2178                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2179                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2180                         ? 1
2181                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2182                            ? 2 : 0)));
2183             }
2184       }
2185       coding->require_flushing = 1;
2186       break;
2187
2188     case 3:
2189       coding->type = coding_type_big5;
2190       coding->flags
2191         = (NILP (XVECTOR (coding_system_vector)->contents[4])
2192            ? CODING_FLAG_BIG5_HKU
2193            : CODING_FLAG_BIG5_ETEN);
2194       break;
2195
2196     case 4:
2197       coding->type = coding_type_ccl;
2198       {
2199         Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
2200         if (CONSP  (val)
2201             && VECTORP (XCONS (val)->car)
2202             && VECTORP (XCONS (val)->cdr))
2203           {
2204             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2205             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2206           }
2207         else
2208           goto label_invalid_coding_system;
2209       }
2210       coding->require_flushing = 1;
2211       break;
2212
2213     default:
2214       if (EQ (type, Qt))
2215         coding->type = coding_type_automatic;
2216       else
2217         coding->type = coding_type_no_conversion;
2218       break;
2219     }
2220   return 0;
2221
2222  label_invalid_coding_system:
2223   coding->type = coding_type_no_conversion;
2224   return -1;
2225 }
2226
2227 /* Emacs has a mechanism to automatically detect a coding system if it
2228    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2229    it's impossible to distinguish some coding systems accurately
2230    because they use the same range of codes.  So, at first, coding
2231    systems are categorized into 7, those are:
2232
2233    o coding-category-internal
2234
2235         The category for a coding system which has the same code range
2236         as Emacs' internal format.  Assigned the coding-system (Lisp
2237         symbol) `coding-system-internal' by default.
2238
2239    o coding-category-sjis
2240
2241         The category for a coding system which has the same code range
2242         as SJIS.  Assigned the coding-system (Lisp
2243         symbol) `coding-system-sjis' by default.
2244
2245    o coding-category-iso-7
2246
2247         The category for a coding system which has the same code range
2248         as ISO2022 of 7-bit environment.  Assigned the coding-system
2249         (Lisp symbol) `coding-system-junet' by default.
2250
2251    o coding-category-iso-8-1
2252
2253         The category for a coding system which has the same code range
2254         as ISO2022 of 8-bit environment and graphic plane 1 used only
2255         for DIMENSION1 charset.  Assigned the coding-system (Lisp
2256         symbol) `coding-system-ctext' by default.
2257
2258    o coding-category-iso-8-2
2259
2260         The category for a coding system which has the same code range
2261         as ISO2022 of 8-bit environment and graphic plane 1 used only
2262         for DIMENSION2 charset.  Assigned the coding-system (Lisp
2263         symbol) `coding-system-euc-japan' by default.
2264
2265    o coding-category-iso-else
2266
2267         The category for a coding system which has the same code range
2268         as ISO2022 but not belongs to any of the above three
2269         categories.  Assigned the coding-system (Lisp symbol)
2270         `coding-system-iso-2022-ss2-7' by default.
2271
2272    o coding-category-big5
2273
2274         The category for a coding system which has the same code range
2275         as BIG5.  Assigned the coding-system (Lisp symbol)
2276         `coding-system-big5' by default.
2277
2278    o coding-category-binary
2279
2280         The category for a coding system not categorized in any of the
2281         above.  Assigned the coding-system (Lisp symbol)
2282         `coding-system-noconv' by default.
2283
2284    Each of them is a Lisp symbol and the value is an actual
2285    `coding-system's (this is also a Lisp symbol) assigned by a user.
2286    What Emacs does actually is to detect a category of coding system.
2287    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2288    decide only one possible category, it selects a category of the
2289    highest priority.  Priorities of categories are also specified by a
2290    user in a Lisp variable `coding-category-list'.
2291
2292 */
2293
2294 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2295    If it detects possible coding systems, return an integer in which
2296    appropriate flag bits are set.  Flag bits are defined by macros
2297    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2298
2299 int
2300 detect_coding_mask (src, src_bytes)
2301      unsigned char *src;
2302      int src_bytes;
2303 {
2304   register unsigned char c;
2305   unsigned char *src_end = src + src_bytes;
2306   int mask;
2307
2308   /* At first, skip all ASCII characters and control characters except
2309      for three ISO2022 specific control characters.  */
2310   while (src < src_end)
2311     {
2312       c = *src;
2313       if (c >= 0x80
2314           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2315         break;
2316       src++;
2317     }
2318
2319   if (src >= src_end)
2320     /* We found nothing other than ASCII.  There's nothing to do.  */
2321     return CODING_CATEGORY_MASK_ANY;
2322
2323   /* The text seems to be encoded in some multilingual coding system.
2324      Now, try to find in which coding system the text is encoded.  */
2325   if (c < 0x80)
2326     /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2327     /* C is an ISO2022 specific control code of C0.  */
2328     mask = detect_coding_iso2022 (src, src_end);
2329
2330   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2331     /* C is an ISO2022 specific control code of C1,
2332        or the first byte of SJIS's 2-byte character code,
2333        or a leading code of Emacs.  */
2334     mask = (detect_coding_iso2022 (src, src_end)
2335             | detect_coding_sjis (src, src_end)
2336             | detect_coding_internal (src, src_end));
2337
2338   else if (c < 0xA0)
2339     /* C is the first byte of SJIS character code,
2340        or a leading-code of Emacs.  */
2341     mask = (detect_coding_sjis (src, src_end)
2342             | detect_coding_internal (src, src_end));
2343
2344   else
2345     /* C is a character of ISO2022 in graphic plane right,
2346        or a SJIS's 1-byte character code (i.e. JISX0201),
2347        or the first byte of BIG5's 2-byte code.  */
2348     mask = (detect_coding_iso2022 (src, src_end)
2349             | detect_coding_sjis (src, src_end)
2350             | detect_coding_big5 (src, src_end));
2351
2352   return mask;
2353 }
2354
2355 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2356    The information of the detected coding system is set in CODING.  */
2357
2358 void
2359 detect_coding (coding, src, src_bytes)
2360      struct coding_system *coding;
2361      unsigned char *src;
2362      int src_bytes;
2363 {
2364   int mask = detect_coding_mask (src, src_bytes);
2365   int idx;
2366
2367   if (mask == CODING_CATEGORY_MASK_ANY)
2368     /* We found nothing other than ASCII.  There's nothing to do.  */
2369     return;
2370
2371   if (!mask)
2372     /* The source text seems to be encoded in unknown coding system.
2373        Emacs regards the category of such a kind of coding system as
2374        `coding-category-binary'.  We assume that a user has assigned
2375        an appropriate coding system for a `coding-category-binary'.  */
2376     idx = CODING_CATEGORY_IDX_BINARY;
2377   else
2378     {
2379       /* We found some plausible coding systems.  Let's use a coding
2380          system of the highest priority.  */
2381       Lisp_Object val = Vcoding_category_list;
2382
2383       if (CONSP (val))
2384         while (!NILP (val))
2385           {
2386             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2387             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2388               break;
2389             val = XCONS (val)->cdr;
2390           }
2391       else
2392         val = Qnil;
2393
2394       if (NILP (val))
2395         {
2396           /* For unknown reason, `Vcoding_category_list' contains none
2397              of found categories.  Let's use any of them.  */
2398           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2399             if (mask & (1 << idx))
2400               break;
2401         }
2402     }
2403   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2404 }
2405
2406 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2407    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2408    CODING_EOL_CR, and CODING_EOL_AUTOMATIC.  */
2409
2410 int
2411 detect_eol_type (src, src_bytes)
2412      unsigned char *src;
2413      int src_bytes;
2414 {
2415   unsigned char *src_end = src + src_bytes;
2416   unsigned char c;
2417
2418   while (src < src_end)
2419     {
2420       c = *src++;
2421       if (c == '\n')
2422         return CODING_EOL_LF;
2423       else if (c == '\r')
2424         {
2425           if (src < src_end && *src == '\n')
2426             return CODING_EOL_CRLF;
2427           else
2428             return CODING_EOL_CR;
2429         }
2430     }
2431   return CODING_EOL_AUTOMATIC;
2432 }
2433
2434 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2435    is encoded.  If it detects an appropriate format of end-of-line, it
2436    sets the information in *CODING.  */
2437
2438 void
2439 detect_eol (coding, src, src_bytes)
2440      struct coding_system *coding;
2441      unsigned char *src;
2442      int src_bytes;
2443 {
2444   Lisp_Object val;
2445   int eol_type = detect_eol_type (src, src_bytes);
2446
2447   if (eol_type == CODING_EOL_AUTOMATIC)
2448     /*  We found no end-of-line in the source text.  */
2449     return;
2450
2451   val = Fget (coding->symbol, Qeol_type);
2452   if (VECTORP (val) && XVECTOR (val)->size == 3)
2453     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2454 }
2455
2456 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2457    decoding, it may detect coding system and format of end-of-line if
2458    those are not yet decided.  */
2459
2460 int
2461 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2462      struct coding_system *coding;
2463      unsigned char *source, *destination;
2464      int src_bytes, dst_bytes;
2465      int *consumed;
2466 {
2467   int produced;
2468
2469   if (src_bytes <= 0)
2470     {
2471       *consumed = 0;
2472       return 0;
2473     }
2474
2475   if (coding->type == coding_type_automatic)
2476     detect_coding (coding, source, src_bytes);
2477
2478   if (coding->eol_type == CODING_EOL_AUTOMATIC)
2479     detect_eol (coding, source, src_bytes);
2480
2481   coding->carryover_size = 0;
2482   switch (coding->type)
2483     {
2484     case coding_type_no_conversion:
2485     label_no_conversion:
2486       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2487       bcopy (source, destination, produced);
2488       *consumed = produced;
2489       break;
2490
2491     case coding_type_internal:
2492     case coding_type_automatic:
2493       if (coding->eol_type == CODING_EOL_LF
2494           ||  coding->eol_type == CODING_EOL_AUTOMATIC)
2495         goto label_no_conversion;
2496       produced = decode_eol (coding, source, destination,
2497                              src_bytes, dst_bytes, consumed);
2498       break;
2499
2500     case coding_type_sjis:
2501       produced = decode_coding_sjis_big5 (coding, source, destination,
2502                                           src_bytes, dst_bytes, consumed,
2503                                           1);
2504       break;
2505
2506     case coding_type_iso2022:
2507       produced = decode_coding_iso2022 (coding, source, destination,
2508                                         src_bytes, dst_bytes, consumed);
2509       break;
2510
2511     case coding_type_big5:
2512       produced = decode_coding_sjis_big5 (coding, source, destination,
2513                                           src_bytes, dst_bytes, consumed,
2514                                           0);
2515       break;
2516
2517     case coding_type_ccl:
2518       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2519                              src_bytes, dst_bytes, consumed);
2520       break;
2521     }
2522
2523   return produced;
2524 }
2525
2526 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2527
2528 int
2529 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2530      struct coding_system *coding;
2531      unsigned char *source, *destination;
2532      int src_bytes, dst_bytes;
2533      int *consumed;
2534 {
2535   int produced;
2536
2537   coding->carryover_size = 0;
2538   switch (coding->type)
2539     {
2540     case coding_type_no_conversion:
2541     label_no_conversion:
2542       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2543       if (produced > 0)
2544         {
2545           bcopy (source, destination, produced);
2546           if (coding->selective)
2547             {
2548               unsigned char *p = destination, *pend = destination + produced;
2549               while (p < pend)
2550                 if (*p++ = '\015') p[-1] = '\n';
2551             }
2552         }
2553       *consumed = produced;
2554       break;
2555
2556     case coding_type_internal:
2557     case coding_type_automatic:
2558       if (coding->eol_type == CODING_EOL_LF
2559           ||  coding->eol_type == CODING_EOL_AUTOMATIC)
2560         goto label_no_conversion;
2561       produced = encode_eol (coding, source, destination,
2562                              src_bytes, dst_bytes, consumed);
2563       break;
2564
2565     case coding_type_sjis:
2566       produced = encode_coding_sjis_big5 (coding, source, destination,
2567                                           src_bytes, dst_bytes, consumed,
2568                                           1);
2569       break;
2570
2571     case coding_type_iso2022:
2572       produced = encode_coding_iso2022 (coding, source, destination,
2573                                         src_bytes, dst_bytes, consumed);
2574       break;
2575
2576     case coding_type_big5:
2577       produced = encode_coding_sjis_big5 (coding, source, destination,
2578                                           src_bytes, dst_bytes, consumed,
2579                                           0);
2580       break;
2581
2582     case coding_type_ccl:
2583       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2584                              src_bytes, dst_bytes, consumed);
2585       break;
2586     }
2587
2588   return produced;
2589 }
2590
2591 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2592
2593 /* Return maximum size (bytes) of a buffer enough for decoding
2594    SRC_BYTES of text encoded in CODING.  */
2595
2596 int
2597 decoding_buffer_size (coding, src_bytes)
2598      struct coding_system *coding;
2599      int src_bytes;
2600 {
2601   int magnification;
2602
2603   if (coding->type == coding_type_iso2022)
2604     magnification = 3;
2605   else if (coding->type == coding_type_ccl)
2606     magnification = coding->spec.ccl.decoder.buf_magnification;
2607   else
2608     magnification = 2;
2609
2610   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2611 }
2612
2613 /* Return maximum size (bytes) of a buffer enough for encoding
2614    SRC_BYTES of text to CODING.  */
2615
2616 int
2617 encoding_buffer_size (coding, src_bytes)
2618      struct coding_system *coding;
2619      int src_bytes;
2620 {
2621   int magnification;
2622
2623   if (coding->type == coding_type_ccl)
2624     magnification = coding->spec.ccl.encoder.buf_magnification;
2625   else
2626     magnification = 3;
2627
2628   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2629 }
2630
2631 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2632 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2633 #endif
2634
2635 char *conversion_buffer;
2636 int conversion_buffer_size;
2637
2638 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2639    or decoding.  Sufficient memory is allocated automatically.  If we
2640    run out of memory, return NULL.  */
2641
2642 char *
2643 get_conversion_buffer (size)
2644      int size;
2645 {
2646   if (size > conversion_buffer_size)
2647     {
2648       char *buf;
2649       int real_size = conversion_buffer_size * 2;
2650
2651       while (real_size < size) real_size *= 2;
2652       buf = (char *) xmalloc (real_size);
2653       xfree (conversion_buffer);
2654       conversion_buffer = buf;
2655       conversion_buffer_size = real_size;
2656     }
2657   return conversion_buffer;
2658 }
2659
2660 \f
2661 #ifdef emacs
2662 /*** 7. Emacs Lisp library functions ***/
2663
2664 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2665        1, 1, 0,
2666   "Return coding-vector of CODING-SYSTEM.\n\
2667 If CODING-SYSTEM is not a valid coding-system, return nil.")
2668   (obj)
2669      Lisp_Object obj;
2670 {
2671   while (SYMBOLP (obj) && !NILP (obj))
2672     obj = Fget (obj, Qcoding_system);
2673   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2674           ? Qnil : obj);
2675 }
2676
2677 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2678   "Return t if OBJECT is nil or a coding-system.\n\
2679 See document of make-coding-system for coding-system object.")
2680   (obj)
2681      Lisp_Object obj;
2682 {
2683   return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2684 }
2685
2686 DEFUN ("read-non-nil-coding-system",
2687        Fread_non_nil_coding_system, Sread_non_nil_coding_system, 1, 1, 0,
2688   "Read a coding-system from the minibuffer, prompting with string PROMPT.")
2689   (prompt)
2690      Lisp_Object prompt;
2691 {
2692   return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2693                                     Qt, Qnil, Qnil),
2694                   Qnil);
2695 }
2696
2697 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2698   "Read a coding-system or nil from the minibuffer, prompting with string PROMPT.")
2699   (prompt)
2700      Lisp_Object prompt;
2701 {
2702   return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2703                                     Qt, Qnil, Qnil),
2704                   Qnil);
2705 }
2706
2707 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2708        1, 1, 0,
2709   "Check validity of CODING-SYSTEM.\n\
2710 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2711 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2712 The value of property should be a vector of length 5.")
2713   (coding_system)
2714      Lisp_Object coding_system;
2715 {
2716   CHECK_SYMBOL (coding_system, 0);
2717   if (!NILP (Fcoding_system_p (coding_system)))
2718     return coding_system;
2719   while (1)
2720     Fsignal (Qcoding_system_error, coding_system);
2721 }
2722
2723 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2724        2, 2, 0,
2725   "Detect coding-system of the text in the region between START and END.\n\
2726 Return a list of possible coding-systems ordered by priority.\n\
2727 If only ASCII characters are found, it returns `coding-system-automatic'\n\
2728  or its subsidiary coding-system according to a detected end-of-line format.")
2729   (b, e)
2730      Lisp_Object b, e;
2731 {
2732   int coding_mask, eol_type;
2733   Lisp_Object val;
2734   int beg, end;
2735
2736   validate_region (&b, &e);
2737   beg = XINT (b), end = XINT (e);
2738   if (beg < GPT && end >= GPT) move_gap (end);
2739
2740   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2741   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2742
2743   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2744     {
2745       val = intern ("coding-system-automatic");
2746       if (eol_type != CODING_EOL_AUTOMATIC)
2747         {
2748           Lisp_Object val2 = Fget (val, Qeol_type);
2749           if (VECTORP (val2))
2750             val = XVECTOR (val2)->contents[eol_type];
2751         }
2752     }
2753   else
2754     {
2755       Lisp_Object val2;
2756
2757       /* At first, gather possible coding-systems in VAL in a reverse
2758          order.  */
2759       val = Qnil;
2760       for (val2 = Vcoding_category_list;
2761            !NILP (val2);
2762            val2 = XCONS (val2)->cdr)
2763         {
2764           int idx
2765             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2766           if (coding_mask & (1 << idx))
2767             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2768         }
2769
2770       /* Then, change the order of the list, while getting subsidiary
2771          coding-systems.  */
2772       val2 = val;
2773       val = Qnil;
2774       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2775         {
2776           if (eol_type == CODING_EOL_AUTOMATIC)
2777             val = Fcons (XCONS (val2)->car, val);
2778           else
2779             {
2780               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2781               if (VECTORP (val3))
2782                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2783               else
2784                 val = Fcons (XCONS (val2)->car, val);
2785             }
2786         }
2787     }
2788
2789   return val;
2790 }
2791
2792 /* Scan text in the region between *BEGP and *ENDP, skip characters
2793    which we never have to encode to (iff ENCODEP is 1) or decode from
2794    coding system CODING at the head and tail, then set BEGP and ENDP
2795    to the addresses of start and end of the text we actually convert.  */
2796
2797 void
2798 shrink_conversion_area (begp, endp, coding, encodep)
2799      unsigned char **begp, **endp;
2800      struct coding_system *coding;
2801      int encodep;
2802 {
2803   register unsigned char *beg_addr = *begp, *end_addr = *endp;
2804
2805   if (coding->eol_type != CODING_EOL_LF
2806       && coding->eol_type != CODING_EOL_AUTOMATIC)
2807     /* Since we anyway have to convert end-of-line format, it is not
2808        worth skipping at most 100 bytes or so.  */
2809     return;
2810
2811   if (encodep)                  /* for encoding */
2812     {
2813       switch (coding->type)
2814         {
2815         case coding_type_no_conversion:
2816         case coding_type_internal:
2817         case coding_type_automatic:
2818           /* We need no conversion.  */
2819           *begp = *endp;
2820           return;
2821         case coding_type_ccl:
2822           /* We can't skip any data.  */
2823           return;
2824         default:
2825           /* We can skip all ASCII characters at the head and tail.  */
2826           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2827           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2828           break;
2829         }
2830     }
2831   else                          /* for decoding */
2832     {
2833       switch (coding->type)
2834         {
2835         case coding_type_no_conversion:
2836           /* We need no conversion.  */
2837           *begp = *endp;
2838           return;
2839         case coding_type_internal:
2840           if (coding->eol_type == CODING_EOL_LF)
2841             {
2842               /* We need no conversion.  */
2843               *begp = *endp;
2844               return;
2845             }
2846           /* We can skip all but carriage-return.  */
2847           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2848           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2849           break;
2850         case coding_type_sjis:
2851         case coding_type_big5:
2852           /* We can skip all ASCII characters at the head.  */
2853           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2854           /* We can skip all ASCII characters at the tail except for
2855              the second byte of SJIS or BIG5 code.  */
2856           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2857           if (end_addr != *endp)
2858             end_addr++;
2859           break;
2860         case coding_type_ccl:
2861           /* We can't skip any data.  */
2862           return;
2863         default:                /* i.e. case coding_type_iso2022: */
2864           {
2865             unsigned char c;
2866
2867             /* We can skip all ASCII characters except for a few
2868                control codes at the head.  */
2869             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2870                    && c != ISO_CODE_CR && c != ISO_CODE_SO
2871                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2872               beg_addr++;
2873           }
2874           break;
2875         }
2876     }
2877   *begp = beg_addr;
2878   *endp = end_addr;
2879   return;
2880 }
2881
2882 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
2883    text between B and E.  B and E are buffer position.  */
2884
2885 Lisp_Object
2886 code_convert_region (b, e, coding, encodep)
2887      Lisp_Object b, e;
2888      struct coding_system *coding;
2889      int encodep;
2890 {
2891   int beg, end, len, consumed, produced;
2892   char *buf;
2893   unsigned char *begp, *endp;
2894   int pos = PT;
2895
2896   validate_region (&b, &e);
2897   beg = XINT (b), end = XINT (e);
2898   if (beg < GPT && end >= GPT)
2899     move_gap (end);
2900
2901   if (encodep && !NILP (coding->pre_write_conversion))
2902     {
2903       /* We must call a pre-conversion function which may put a new
2904          text to be converted in a new buffer.  */
2905       struct buffer *old = current_buffer, *new;
2906
2907       TEMP_SET_PT (beg);
2908       call2 (coding->pre_write_conversion, b, e);
2909       if (old != current_buffer)
2910         {
2911           /* Replace the original text by the text just generated.  */
2912           len = ZV - BEGV;
2913           new = current_buffer;
2914           set_buffer_internal (old);
2915           del_range (beg, end);
2916           insert_from_buffer (new, 1, len, 0);
2917           end = beg + len;
2918         }
2919     }
2920
2921   /* We may be able to shrink the conversion region.  */
2922   begp = POS_ADDR (beg); endp = begp + (end - beg);
2923   shrink_conversion_area (&begp, &endp, coding, encodep);
2924
2925   if (begp == endp)
2926     /* We need no conversion.  */
2927     len = end - beg;
2928   else
2929     {
2930       beg += begp - POS_ADDR (beg);
2931       end =  beg + (endp - begp);
2932
2933       if (encodep)
2934         len = encoding_buffer_size (coding, end - beg);
2935       else
2936         len = decoding_buffer_size (coding, end - beg);
2937       buf = get_conversion_buffer (len);
2938
2939       coding->last_block = 1;
2940       produced = (encodep
2941                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
2942                                    &consumed)
2943                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
2944                                    &consumed));
2945
2946       len = produced + (beg - XINT (b)) + (XINT (e) - end);
2947
2948       TEMP_SET_PT (beg);
2949       insert (buf, produced);
2950       del_range (PT, PT + end - beg);
2951       if (pos >= end)
2952         pos = PT + (pos - end);
2953       else if (pos > beg)
2954         pos = beg;
2955       TEMP_SET_PT (pos);
2956   }
2957
2958   if (!encodep && !NILP (coding->post_read_conversion))
2959     {
2960       /* We must call a post-conversion function which may alter
2961          the text just converted.  */
2962       Lisp_Object insval;
2963
2964       beg = XINT (b);
2965       TEMP_SET_PT (beg);
2966       insval = call1 (coding->post_read_conversion, make_number (len));
2967       CHECK_NUMBER (insval, 0);
2968       len = XINT (insval);
2969     }
2970
2971   return make_number (len);
2972 }
2973
2974 Lisp_Object
2975 code_convert_string (str, coding, encodep)
2976      Lisp_Object str;
2977      struct coding_system *coding;
2978      int encodep;
2979 {
2980   int len, consumed, produced;
2981   char *buf;
2982   unsigned char *begp, *endp;
2983   int head_skip, tail_skip;
2984   struct gcpro gcpro1;
2985
2986   if (encodep && !NILP (coding->pre_write_conversion)
2987       || !encodep && !NILP (coding->post_read_conversion))
2988     {
2989       /* Since we have to call Lisp functions which assume target text
2990          is in a buffer, after setting a temporary buffer, call
2991          code_convert_region.  */
2992       int count = specpdl_ptr - specpdl;
2993       int len = XSTRING (str)->size;
2994       Lisp_Object result;
2995       struct buffer *old = current_buffer;
2996
2997       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
2998       temp_output_buffer_setup (" *code-converting-work*");
2999       set_buffer_internal (XBUFFER (Vstandard_output));
3000       insert_from_string (str, 0, len, 0);
3001       code_convert_region (make_number (BEGV), make_number (ZV),
3002                            coding, encodep);
3003       result = make_buffer_string (BEGV, ZV, 0);
3004       set_buffer_internal (old);
3005       return unbind_to (count, result);
3006     }
3007
3008   /* We may be able to shrink the conversion region.  */
3009   begp = XSTRING (str)->data;
3010   endp = begp + XSTRING (str)->size;
3011   shrink_conversion_area (&begp, &endp, coding, encodep);
3012
3013   if (begp == endp)
3014     /* We need no conversion.  */
3015     return str;
3016
3017   head_skip = begp - XSTRING (str)->data;
3018   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3019
3020   GCPRO1 (str);
3021
3022   if (encodep)
3023     len = encoding_buffer_size (coding, endp - begp);
3024   else
3025     len = decoding_buffer_size (coding, endp - begp);
3026   buf = get_conversion_buffer (len + head_skip + tail_skip);
3027
3028   bcopy (XSTRING (str)->data, buf, head_skip);
3029   coding->last_block = 1;
3030   produced = (encodep
3031               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3032                                buf + head_skip, endp - begp, len, &consumed)
3033               : decode_coding (coding, XSTRING (str)->data + head_skip,
3034                                buf + head_skip, endp - begp, len, &consumed));
3035   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3036          buf + head_skip + produced,
3037          tail_skip);
3038
3039   UNGCPRO;
3040
3041   return make_string (buf, head_skip + produced + tail_skip);
3042 }
3043
3044 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3045        3, 3, 0,
3046   "Decode the text between START and END which is encoded in CODING-SYSTEM.\n\
3047 Return length of decoded text.")
3048   (b, e, coding_system)
3049      Lisp_Object b, e, coding_system;
3050 {
3051   struct coding_system coding;
3052
3053   CHECK_NUMBER_COERCE_MARKER (b, 0);
3054   CHECK_NUMBER_COERCE_MARKER (e, 1);
3055   CHECK_SYMBOL (coding_system, 2);
3056
3057   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3058     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3059
3060   return code_convert_region (b, e, &coding, 0);
3061 }
3062
3063 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3064        3, 3, 0,
3065   "Encode the text between START and END to CODING-SYSTEM.\n\
3066 Return length of encoded text.")
3067   (b, e, coding_system)
3068      Lisp_Object b, e, coding_system;
3069 {
3070   struct coding_system coding;
3071
3072   CHECK_NUMBER_COERCE_MARKER (b, 0);
3073   CHECK_NUMBER_COERCE_MARKER (e, 1);
3074   CHECK_SYMBOL (coding_system, 2);
3075
3076   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3077     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3078
3079   return code_convert_region (b, e, &coding, 1);
3080 }
3081
3082 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3083        2, 2, 0,
3084   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.")
3085   (string, coding_system)
3086      Lisp_Object string, coding_system;
3087 {
3088   struct coding_system coding;
3089
3090   CHECK_STRING (string, 0);
3091   CHECK_SYMBOL (coding_system, 1);
3092
3093   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3094     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3095
3096   return code_convert_string (string, &coding, 0);
3097 }
3098
3099 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3100        2, 2, 0,
3101   "Encode STRING to CODING-SYSTEM, and return the result.")
3102   (string, coding_system)
3103      Lisp_Object string, coding_system;
3104 {
3105   struct coding_system coding;
3106
3107   CHECK_STRING (string, 0);
3108   CHECK_SYMBOL (coding_system, 1);
3109
3110   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3111     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3112
3113   return code_convert_string (string, &coding, 1);
3114 }
3115
3116 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3117   "Decode a JISX0208 character of SJIS coding-system-sjis.\n\
3118 CODE is the character code in SJIS.\n\
3119 Return the corresponding character.")
3120   (code)
3121      Lisp_Object code;
3122 {
3123   unsigned char c1, c2, s1, s2;
3124   Lisp_Object val;
3125
3126   CHECK_NUMBER (code, 0);
3127   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3128   DECODE_SJIS (s1, s2, c1, c2);
3129   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3130   return val;
3131 }
3132
3133 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3134   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3135 Return the corresponding character code in SJIS.")
3136   (ch)
3137      Lisp_Object ch;
3138 {
3139   int charset;
3140   unsigned char c1, c2, s1, s2;
3141   Lisp_Object val;
3142
3143   CHECK_NUMBER (ch, 0);
3144   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3145   if (charset == charset_jisx0208)
3146     {
3147       ENCODE_SJIS (c1, c2, s1, s2);
3148       XSETFASTINT (val, ((int)s1 << 8) | s2);
3149     }
3150   else
3151     XSETFASTINT (val, 0);
3152   return val;
3153 }
3154
3155 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3156   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3157 CODE is the character code in BIG5.\n\
3158 Return the corresponding character.")
3159   (code)
3160      Lisp_Object code;
3161 {
3162   int charset;
3163   unsigned char b1, b2, c1, c2;
3164   Lisp_Object val;
3165
3166   CHECK_NUMBER (code, 0);
3167   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3168   DECODE_BIG5 (b1, b2, charset, c1, c2);
3169   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3170   return val;
3171 }
3172
3173 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3174   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3175 Return the corresponding character code in Big5.")
3176   (ch)
3177      Lisp_Object ch;
3178 {
3179   int charset;
3180   unsigned char c1, c2, b1, b2;
3181   Lisp_Object val;
3182
3183   CHECK_NUMBER (ch, 0);
3184   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3185   if (charset == charset_big5_1 || charset == charset_big5_2)
3186     {
3187       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3188       XSETFASTINT (val, ((int)b1 << 8) | b2);
3189     }
3190   else
3191     XSETFASTINT (val, 0);
3192   return val;
3193 }
3194
3195 DEFUN ("set-terminal-coding-system",
3196        Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3197        "zCoding-system for terminal display: ",
3198   "Set coding-system of your terminal to CODING-SYSTEM.\n\
3199 All outputs to terminal are encoded to this coding-system.")
3200   (coding_system)
3201      Lisp_Object coding_system;
3202 {
3203   CHECK_SYMBOL (coding_system, 0);
3204   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3205   update_mode_lines++;
3206   if (!NILP (Finteractive_p ()))
3207     Fredraw_display ();
3208   return Qnil;
3209 }
3210
3211 DEFUN ("terminal-coding-system",
3212        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3213   "Return coding-system of your terminal.")
3214   ()
3215 {
3216   return terminal_coding.symbol;
3217 }
3218
3219 DEFUN ("set-keyboard-coding-system",
3220        Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
3221        "zCoding-system for keyboard input: ",
3222   "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3223 All inputs from terminal are decoded from this coding-system.")
3224   (coding_system)
3225      Lisp_Object coding_system;
3226 {
3227   CHECK_SYMBOL (coding_system, 0);
3228   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3229   return Qnil;
3230 }
3231
3232 DEFUN ("keyboard-coding-system",
3233        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3234   "Return coding-system of what is sent from terminal keyboard.")
3235   ()
3236 {
3237   return keyboard_coding.symbol;
3238 }
3239
3240 \f
3241 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3242        1, MANY, 0,
3243   "Return a cons of coding systems for I/O primitive OPERATION.\n\
3244 Remaining arguments are for OPERATION.\n\
3245 OPERATION is one of the following Emacs I/O primitives:\n\
3246   For file I/O, insert-file-contents or write-region.\n\
3247   For process I/O, call-process, call-process-region, or start-process.\n\
3248   For network I/O, open-network-stream.\n\
3249 For each OPERATION, TARGET is selected from the arguments as below:\n\
3250   For file I/O, TARGET is a file name.\n\
3251   For process I/O, TARGET is a process name.\n\
3252   For network I/O, TARGET is a service name or a port number\n\
3253 \n\
3254 The return value is a cons of coding systems for decoding and encoding\n\
3255 registered in nested alist `coding-system-alist' (which see) at a slot\n\
3256 corresponding to OPERATION and TARGET.
3257 If a function symbol is at the slot, return a result of the function call.\n\
3258 The function is called with one argument, a list of all the arguments.")
3259   (nargs, args)
3260      int nargs;
3261      Lisp_Object *args;
3262 {
3263   Lisp_Object operation, target_idx, target, val;
3264   register Lisp_Object chain;
3265
3266   if (nargs < 2)
3267     error ("Too few arguments");
3268   operation = args[0];
3269   if (!SYMBOLP (operation)
3270       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3271     error ("Invalid first arguement");
3272   if (nargs < 1 + XINT (target_idx))
3273     error ("Too few arguments for operation: %s",
3274            XSYMBOL (operation)->name->data);
3275   target = args[XINT (target_idx) + 1];
3276   if (!(STRINGP (target)
3277         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3278     error ("Invalid %dth argument", XINT (target_idx) + 1);
3279
3280   chain = Fassq (operation, Vcoding_system_alist);
3281   if (NILP (chain))
3282     return Qnil;
3283
3284   for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3285     {
3286       Lisp_Object elt = XCONS (chain)->car;
3287
3288       if (CONSP (elt)
3289           && ((STRINGP (target)
3290                && STRINGP (XCONS (elt)->car)
3291                && fast_string_match (XCONS (elt)->car, target) >= 0)
3292               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3293         return (CONSP (val = XCONS (elt)->cdr)
3294                 ? val
3295                 : ((SYMBOLP (val) && Fboundp (val)
3296                     ? call2 (val, Flist (nargs, args))
3297                     : Qnil)));
3298     }
3299   return Qnil;
3300 }
3301
3302 #endif /* emacs */
3303
3304 \f
3305 /*** 8. Post-amble ***/
3306
3307 init_coding_once ()
3308 {
3309   int i;
3310
3311   /* Emacs internal format specific initialize routine.  */
3312   for (i = 0; i <= 0x20; i++)
3313     emacs_code_class[i] = EMACS_control_code;
3314   emacs_code_class[0x0A] = EMACS_linefeed_code;
3315   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3316   for (i = 0x21 ; i < 0x7F; i++)
3317     emacs_code_class[i] = EMACS_ascii_code;
3318   emacs_code_class[0x7F] = EMACS_control_code;
3319   emacs_code_class[0x80] = EMACS_leading_code_composition;
3320   for (i = 0x81; i < 0xFF; i++)
3321     emacs_code_class[i] = EMACS_invalid_code;
3322   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3323   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3324   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3325   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3326
3327   /* ISO2022 specific initialize routine.  */
3328   for (i = 0; i < 0x20; i++)
3329     iso_code_class[i] = ISO_control_code;
3330   for (i = 0x21; i < 0x7F; i++)
3331     iso_code_class[i] = ISO_graphic_plane_0;
3332   for (i = 0x80; i < 0xA0; i++)
3333     iso_code_class[i] = ISO_control_code;
3334   for (i = 0xA1; i < 0xFF; i++)
3335     iso_code_class[i] = ISO_graphic_plane_1;
3336   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3337   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3338   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3339   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3340   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3341   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3342   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3343   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3344   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3345   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3346
3347   Qcoding_system = intern ("coding-system");
3348   staticpro (&Qcoding_system);
3349
3350   Qeol_type = intern ("eol-type");
3351   staticpro (&Qeol_type);
3352
3353   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3354   staticpro (&Qbuffer_file_coding_system);
3355
3356   Qpost_read_conversion = intern ("post-read-conversion");
3357   staticpro (&Qpost_read_conversion);
3358
3359   Qpre_write_conversion = intern ("pre-write-conversion");
3360   staticpro (&Qpre_write_conversion);
3361
3362   Qcoding_system_vector = intern ("coding-system-vector");
3363   staticpro (&Qcoding_system_vector);
3364
3365   Qcoding_system_p = intern ("coding-system-p");
3366   staticpro (&Qcoding_system_p);
3367
3368   Qcoding_system_error = intern ("coding-system-error");
3369   staticpro (&Qcoding_system_error);
3370
3371   Fput (Qcoding_system_error, Qerror_conditions,
3372         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3373   Fput (Qcoding_system_error, Qerror_message,
3374         build_string ("Coding-system error"));
3375
3376   Qcoding_category_index = intern ("coding-category-index");
3377   staticpro (&Qcoding_category_index);
3378
3379   {
3380     int i;
3381     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3382       {
3383         coding_category_table[i] = intern (coding_category_name[i]);
3384         staticpro (&coding_category_table[i]);
3385         Fput (coding_category_table[i], Qcoding_category_index,
3386               make_number (i));
3387       }
3388   }
3389
3390   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3391   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3392
3393   setup_coding_system (Qnil, &keyboard_coding);
3394   setup_coding_system (Qnil, &terminal_coding);
3395 }
3396
3397 #ifdef emacs
3398
3399 syms_of_coding ()
3400 {
3401   Qtarget_idx = intern ("target-idx");
3402   staticpro (&Qtarget_idx);
3403
3404   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3405   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3406
3407   Qcall_process = intern ("call-process");
3408   staticpro (&Qcall_process);
3409   Fput (Qcall_process, Qtarget_idx, make_number (0));
3410
3411   Qcall_process_region = intern ("call-process-region");
3412   staticpro (&Qcall_process_region);
3413   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3414
3415   Qstart_process = intern ("start-process");
3416   staticpro (&Qstart_process);
3417   Fput (Qstart_process, Qtarget_idx, make_number (2));
3418
3419   Qopen_network_stream = intern ("open-network-stream");
3420   staticpro (&Qopen_network_stream);
3421   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3422
3423   defsubr (&Scoding_system_vector);
3424   defsubr (&Scoding_system_p);
3425   defsubr (&Sread_coding_system);
3426   defsubr (&Sread_non_nil_coding_system);
3427   defsubr (&Scheck_coding_system);
3428   defsubr (&Sdetect_coding_region);
3429   defsubr (&Sdecode_coding_region);
3430   defsubr (&Sencode_coding_region);
3431   defsubr (&Sdecode_coding_string);
3432   defsubr (&Sencode_coding_string);
3433   defsubr (&Sdecode_sjis_char);
3434   defsubr (&Sencode_sjis_char);
3435   defsubr (&Sdecode_big5_char);
3436   defsubr (&Sencode_big5_char);
3437   defsubr (&Sset_terminal_coding_system);
3438   defsubr (&Sterminal_coding_system);
3439   defsubr (&Sset_keyboard_coding_system);
3440   defsubr (&Skeyboard_coding_system);
3441   defsubr (&Sfind_coding_system);
3442
3443   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3444     "List of coding-categories (symbols) ordered by priority.");
3445   {
3446     int i;
3447
3448     Vcoding_category_list = Qnil;
3449     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3450       Vcoding_category_list
3451         = Fcons (coding_category_table[i], Vcoding_category_list);
3452   }
3453
3454   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3455     "A variable of internal use only.\n\
3456 If the value is a coding system, it is used for decoding on read operation.\n\
3457 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3458   Vcoding_system_for_read = Qnil;
3459
3460   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3461     "A variable of internal use only.\n\
3462 If the value is a coding system, it is used for encoding on write operation.\n\
3463 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3464   Vcoding_system_for_write = Qnil;
3465
3466   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3467     "Coding-system used in the latest file or process I/O.");
3468   Vlast_coding_system_used = Qnil;
3469
3470   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3471     "Nested alist to decide a coding system for a specific I/O operation.\n\
3472 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3473
3474 OPERATION is one of the following Emacs I/O primitives:\n\
3475   For file I/O, insert-file-contents and write-region.\n\
3476   For process I/O, call-process, call-process-region, and start-process.\n\
3477   For network I/O, open-network-stream.\n\
3478 In addition, for process I/O, `process-argument' can be specified for\n\
3479 encoding arguments of the process.\n\
3480 \n\
3481 REGEXP is a regular expression matching a target of OPERATION, where\n\
3482 target is a file name for file I/O operations, a process name for\n\
3483 process I/O operations, or a service name for network I/O\n\
3484 operations.  REGEXP might be a port number for network I/O operation.\n\
3485 \n\
3486 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3487 character code on OPERATION, or a function symbol returning the cons.\n\
3488 See the documentation of `find-coding-system' for more detail.");
3489   Vcoding_system_alist = Qnil;
3490
3491   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3492     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3493   eol_mnemonic_unix = '.';
3494
3495   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3496     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3497   eol_mnemonic_dos = ':';
3498
3499   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3500     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3501   eol_mnemonic_mac = '\'';
3502
3503   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3504     "Mnemonic character indicating end-of-line format is not yet decided.");
3505   eol_mnemonic_undecided = '-';
3506
3507   DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
3508     "Alist of charsets vs the alternate charsets.\n\
3509 While decoding, if a charset (car part of an element) is found,\n\
3510 decode it as the alternate charset (cdr part of the element).");
3511   Valternate_charset_table = Qnil;
3512
3513   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3514     "Alist of charsets vs revision numbers.\n\
3515 While encoding, if a charset (car part of an element) is found,\n\
3516 designate it with the escape sequence identifing revision (cdr part of the element).");
3517   Vcharset_revision_alist = Qnil;
3518 }
3519
3520 #endif /* emacs */