src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Raw text
  71
  72   A coding system for a text containing random 8-bit code.  Emacs does
  73   no code conversion on such a text except for end-of-line format.
  74
  75   5. Other
  76
  77   If a user wants to read/write a text encoded in a coding system not
  78   listed above, he can supply a decoder and an encoder for it in CCL
  79   (Code Conversion Language) programs.  Emacs executes the CCL program
  80   while reading/writing.
  81
  82   Emacs represents a coding system by a Lisp symbol that has a property
  83   `coding-system'.  But, before actually using the coding system, the
  84   information about it is set in a structure of type `struct
  85   coding_system' for rapid processing.  See section 6 for more details.
  86
  87 */
  88
  89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  90
  91   How end-of-line of a text is encoded depends on the system.  For
  92   instance, Unix's format is just one byte of `line-feed' code,
  93   whereas DOS's format is two-byte sequence of `carriage-return' and
  94   `line-feed' codes.  MacOS's format is usually one byte of
  95   `carriage-return'.
  96
  97   Since text characters encoding and end-of-line encoding are
  98   independent, any coding system described above can take
  99   any format of end-of-line.  So, Emacs has information of format of
 100   end-of-line in each coding-system.  See section 6 for more details.
 101
 102 */
 103
 104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 105
 106   These functions check if a text between SRC and SRC_END is encoded
 107   in the coding system category XXX.  Each returns an integer value in
 108   which appropriate flag bits for the category XXX are set.  The flag
 109   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 110   template of these functions.  */
 111 #if 0
 112 int
 113 detect_coding_emacs_mule (src, src_end)
 114      unsigned char *src, *src_end;
 115 {
 116   ...
 117 }
 118 #endif
 119
 120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 121
 122   These functions decode SRC_BYTES length text at SOURCE encoded in
 123   CODING to Emacs' internal format (emacs-mule).  The resulting text
 124   goes to a place pointed to by DESTINATION, the length of which
 125   should not exceed DST_BYTES.  These functions set the information of
 126   original and decoded texts in the members produced, produced_char,
 127   consumed, and consumed_char of the structure *CODING.
 128
 129   The return value is an integer (CODING_FINISH_XXX) indicating how
 130   the decoding finished.
 131
 132   DST_BYTES zero means that source area and destination area are
 133   overlapped, which means that we can produce a decoded text until it
 134   reaches the head of not-yet-decoded source text.
 135
 136   Below is a template of these functions.  */
 137 #if 0
 138 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 139      struct coding_system *coding;
 140      unsigned char *source, *destination;
 141      int src_bytes, dst_bytes;
 142 {
 143   ...
 144 }
 145 #endif
 146
 147 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 148
 149   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 150   internal format (emacs-mule) to CODING.  The resulting text goes to
 151   a place pointed to by DESTINATION, the length of which should not
 152   exceed DST_BYTES.  These functions set the information of
 153   original and encoded texts in the members produced, produced_char,
 154   consumed, and consumed_char of the structure *CODING.
 155
 156   The return value is an integer (CODING_FINISH_XXX) indicating how
 157   the encoding finished.
 158
 159   DST_BYTES zero means that source area and destination area are
 160   overlapped, which means that we can produce an encoded text until it
 161   reaches the head of not-yet-encoded source text.
 162
 163   Below is a template of these functions.  */
 164 #if 0
 165 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 166      struct coding_system *coding;
 167      unsigned char *source, *destination;
 168      int src_bytes, dst_bytes;
 169 {
 170   ...
 171 }
 172 #endif
 173
 174 /*** COMMONLY USED MACROS ***/
 175
 176 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 177    THREE_MORE_BYTES safely get one, two, and three bytes from `src'
 178    respectively, advancing `src' past them.  If fewer bytes remain
 179    before `src_end', they consume nothing and jump to
 180    `label_end_of_loop'.  The caller must provide all these names.  */
 181
 182 #define ONE_MORE_BYTE(c1)       \
 183   do {                          \
 184     if (src < src_end)          \
 185       c1 = *src++;              \
 186     else                        \
 187       goto label_end_of_loop;   \
 188   } while (0)
 189
 190 #define TWO_MORE_BYTES(c1, c2)  \
 191   do {                          \
 192     if (src + 1 < src_end)      \
 193       c1 = *src++, c2 = *src++; \
 194     else                        \
 195       goto label_end_of_loop;   \
 196   } while (0)
 197
 198 #define THREE_MORE_BYTES(c1, c2, c3)            \
 199   do {                                          \
 200     if (src + 2 < src_end)                      \
 201       c1 = *src++, c2 = *src++, c3 = *src++;    \
 202     else                                        \
 203       goto label_end_of_loop;                   \
 204   } while (0)
 205
 206 /* The following three macros DECODE_CHARACTER_ASCII,
 207    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 208    the multi-byte form of a character of each class at the place
 209    pointed by `dst'.  The caller should set the variable `dst' to
 210    point to an appropriate area and the variable `coding' to point to
 211    the coding-system of the currently decoding text in advance.  */
 212
 213 /* Decode one ASCII character C (0xA0, C | 0x80 while composing).  */
 214
 215 #define DECODE_CHARACTER_ASCII(c)                               \
 216   do {                                                          \
 217     if (COMPOSING_P (coding->composing))                        \
 218       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 219     else                                                        \
 220       {                                                         \
 221         *dst++ = (c);                                           \
 222         coding->produced_char++;                                \
 223       }                                                         \
 224   } while (0)
 225
 226 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 227    position-code is C.  While composing, write leading-code + 0x20.  */
 228
 229 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 230   do {                                                                  \
 231     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 232     if (COMPOSING_P (coding->composing))                                \
 233       *dst++ = leading_code + 0x20;                                     \
 234     else                                                                \
 235       {                                                                 \
 236         *dst++ = leading_code;                                          \
 237         coding->produced_char++;                                        \
 238       }                                                                 \
 239     if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) /* not == */ \
 240       *dst++ = leading_code;                                            \
 241     *dst++ = (c) | 0x80;                                                \
 242   } while (0)
 243
 244 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 245    position-codes are C1 and C2 (C2 follows the DIMENSION1 form of C1).  */
 246
 247 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 248   do {                                                  \
 249     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 250     *dst++ = (c2) | 0x80;                               \
 251   } while (0)
 252
 253 \f
 254 /*** 1. Preamble ***/
 255
 256 #include <stdio.h>
 257
 258 #ifdef emacs
 259
 260 #include <config.h>
 261 #include "lisp.h"
 262 #include "buffer.h"
 263 #include "charset.h"
 264 #include "ccl.h"
 265 #include "coding.h"
 266 #include "window.h"
 267
 268 #else  /* not emacs */
 269
 270 #include "mulelib.h"
 271
 272 #endif /* not emacs */
 273
 274 Lisp_Object Qcoding_system, Qeol_type;
 275 Lisp_Object Qbuffer_file_coding_system;
 276 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 277 Lisp_Object Qno_conversion, Qundecided;
 278 Lisp_Object Qcoding_system_history;
 279 Lisp_Object Qsafe_charsets;
 280
 281 extern Lisp_Object Qinsert_file_contents, Qwrite_region; /* Only declared here.  */
 282 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 283 Lisp_Object Qstart_process, Qopen_network_stream;
 284 Lisp_Object Qtarget_idx;
 285
 286 Lisp_Object Vselect_safe_coding_system_function;
 287
 288 /* Mnemonic characters for the Unix, DOS, and Mac end-of-line formats.  */
 289 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 290 /* Mnemonic character to indicate that the format of end-of-line is
 291    not yet decided.  */
 292 int eol_mnemonic_undecided;
 293
 294 /* Format of end-of-line decided by the system.  This is CODING_EOL_LF on
 295    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 296 int system_eol_type;
 297
 298 #ifdef emacs
 299
 300 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 301
 302 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 303
 304 /* Coding systems emacs-mule and raw-text are for converting only
 305    end-of-line format.  */
 306 Lisp_Object Qemacs_mule, Qraw_text;
 307
 308 /* Coding-systems are handed between Emacs Lisp programs and C internal
 309    routines by the following three variables.  */
 310 /* Coding-system for reading files and receiving data from a process.  */
 311 Lisp_Object Vcoding_system_for_read;
 312 /* Coding-system for writing files and sending data to a process.  */
 313 Lisp_Object Vcoding_system_for_write;
 314 /* Coding-system actually used in the latest I/O.  */
 315 Lisp_Object Vlast_coding_system_used;
 316
 317 /* A vector of length 256 which contains information about special
 318    Latin codes (especially for dealing with Microsoft code).  */
 319 Lisp_Object Vlatin_extra_code_table;
 320
 321 /* Flag to inhibit code conversion of end-of-line format.  */
 322 int inhibit_eol_conversion;
 323
 324 /* Coding system to be used to encode text for terminal display.  */
 325 struct coding_system terminal_coding;
 326
 327 /* Coding system to be used to encode text for terminal display when
 328    terminal coding system is nil.  */
 329 struct coding_system safe_terminal_coding;
 330
 331 /* Coding system of what is sent from terminal keyboard.  */
 332 struct coding_system keyboard_coding;
 333
 334 Lisp_Object Vfile_coding_system_alist;
 335 Lisp_Object Vprocess_coding_system_alist;
 336 Lisp_Object Vnetwork_coding_system_alist;
 337
 338 #endif /* emacs */
 339
 340 Lisp_Object Qcoding_category, Qcoding_category_index;
 341
 342 /* List of symbols `coding-category-xxx' ordered by priority.  */
 343 Lisp_Object Vcoding_category_list;
 344
 345 /* Table of coding categories (Lisp symbols).  */
 346 Lisp_Object Vcoding_category_table;
 347
 348 /* Table of the symbol name of each coding-category.  */
 349 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 350   "coding-category-emacs-mule",
 351   "coding-category-sjis",
 352   "coding-category-iso-7",
 353   "coding-category-iso-7-tight",
 354   "coding-category-iso-8-1",
 355   "coding-category-iso-8-2",
 356   "coding-category-iso-7-else",
 357   "coding-category-iso-8-else",
 358   "coding-category-big5",
 359   "coding-category-raw-text",
 360   "coding-category-binary"
 361 };
 362
 363 /* Table of pointers to the coding systems corresponding to each
 364    coding category.  */
 365 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 366
 367 /* Flag to tell if we look up the unification tables on character code
 368    conversion.  */
 369 Lisp_Object Venable_character_unification;
 370 /* Standard unification table to look up on decoding (reading).  */
 371 Lisp_Object Vstandard_character_unification_table_for_decode;
 372 /* Standard unification table to look up on encoding (writing).  */
 373 Lisp_Object Vstandard_character_unification_table_for_encode;
 374
 375 Lisp_Object Qcharacter_unification_table;
 376 Lisp_Object Qcharacter_unification_table_for_decode;
 377 Lisp_Object Qcharacter_unification_table_for_encode;
 378
 379 /* Alist of charsets vs revision number.  */
 380 Lisp_Object Vcharset_revision_alist;
 381
 382 /* Default coding systems used for process I/O.  */
 383 Lisp_Object Vdefault_process_coding_system;
 384
 385 \f
 386 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 387
 388 /* Emacs' internal format for encoding multiple character sets is a
 389    kind of multi-byte encoding, i.e. characters are encoded by
 390    variable-length sequences of one-byte codes.  ASCII characters
 391    and control characters (e.g. `tab', `newline') are represented by
 392    one-byte sequences which are their ASCII codes, in the range 0x00
 393    through 0x7F.  The other characters are represented by a sequence
 394    of `base leading-code', optional `extended leading-code', and one
 395    or two `position-code's.  The length of the sequence is determined
 396    by the base leading-code.  Leading-code takes the range 0x80
 397    through 0x9F, whereas extended leading-code and position-code take
 398    the range 0xA0 through 0xFF.  See `charset.h' for more details
 399    about leading-code and position-code.
 400
 401    There's one exception to this rule.  Special leading-code
 402    `leading-code-composition' denotes that the following several
 403    characters should be composed into one character.  Leading-codes of
 404    components (except for ASCII) have 0x20 added.  An ASCII character
 405    component is represented by a 2-byte sequence of `0xA0' and
 406    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 407    details of composite character.  Hence, we can summarize the code
 408    range as follows:
 409
 410    --- CODE RANGE of Emacs' internal format ---
 411    (character set)      (range)
 412    ASCII                0x00 .. 0x7F
 413    ELSE (1st byte)      0x80 .. 0x9F
 414         (rest bytes)    0xA0 .. 0xFF
 415    ---------------------------------------------
 416
 417   */
 418
 419 enum emacs_code_class_type emacs_code_class[256];
 420
 421 /* Advance SRC and go on only if *SRC is accessible and >= 0xA0.  Else
 422    go to label_end_of_switch (no more bytes) or return 0 (< 0xA0).  */
 423 #define CHECK_CODE_RANGE_A0_FF  \
 424   do {                          \
 425     if (src >= src_end)         \
 426       goto label_end_of_switch; \
 427     else if (*src++ < 0xA0)     \
 428       return 0;                 \
 429   } while (0)
 430
 431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 432    Check if the text from SRC up to SRC_END is in Emacs' internal format.
 433    If it is, return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 434
 435 int
 436 detect_coding_emacs_mule (src, src_end)
 437      unsigned char *src, *src_end;
 438 {
 439   unsigned char c;
 440   int composing = 0;            /* Nonzero inside a composition sequence.  */
 441
 442   while (src < src_end)
 443     {
 444       c = *src++;
 445
 446       if (composing)
 447         {
 448           if (c < 0xA0)
 449             composing = 0;      /* A byte below 0xA0 ends the composition.  */
 450           else
 451             c -= 0x20;          /* Undo the 0x20 added to a component's leading-code.  */
 452         }
 453
 454       switch (emacs_code_class[c])
 455         {
 456         case EMACS_ascii_code:
 457         case EMACS_linefeed_code:
 458           break;
 459
 460         case EMACS_control_code:
 461           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 462             return 0;           /* NOTE(review): presumably signs of ISO2022 text.  */
 463           break;
 464
 465         case EMACS_invalid_code:
 466           return 0;
 467
 468         case EMACS_leading_code_composition: /* c == 0x80 */
 469           if (composing)
 470             CHECK_CODE_RANGE_A0_FF; /* Byte was 0xA0 (ASCII component); check its second byte.  */
 471           else
 472             composing = 1;      /* A composition sequence starts here.  */
 473           break;
 474
 475         case EMACS_leading_code_4:
 476           CHECK_CODE_RANGE_A0_FF;
 477           /* fall through to check it two more times ...  */
 478
 479         case EMACS_leading_code_3:
 480           CHECK_CODE_RANGE_A0_FF;
 481           /* fall through to check it one more time ...  */
 482
 483         case EMACS_leading_code_2:
 484           CHECK_CODE_RANGE_A0_FF;
 485           break;
 486
 487         default:
 488         label_end_of_switch:    /* Also reached when the text ends inside a sequence.  */
 489           break;
 490         }
 491     }
 492   return CODING_CATEGORY_MASK_EMACS_MULE;
 493 }
 494
 495 \f
 496 /*** 3. ISO2022 handlers ***/
 497
 498 /* The following note describes the coding system ISO2022 briefly.
 499    Since the intention of this note is to help in understanding of
 500    the programs in this file, some parts are NOT ACCURATE or OVERLY
 501    SIMPLIFIED.  For the thorough understanding, please refer to the
 502    original document of ISO2022.
 503
 504    ISO2022 provides many mechanisms to encode several character sets
 505    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 506    all text is encoded by codes of less than 128.  This may make the
 507    encoded text a little bit longer, but the text gets more stability
 508    to pass through several gateways (some of them strip off the MSB).
 509
 510    There are two kinds of character set: control character set and
 511    graphic character set.  The former contains control characters such
 512    as `newline' and `escape' to provide control functions (control
 513    functions are provided also by escape sequences).  The latter
 514    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 515    two control character sets and many graphic character sets.
 516
 517    Graphic character sets are classified into one of the following
 518    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 519    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 520    bytes (DIMENSION) and the number of characters in one dimension
 521    (CHARS) of the set.  In addition, each character set is assigned an
 522    identification tag (called "final character" and denoted as <F>
 523    here after) which is unique in each class.  <F> of each character
 524    set is decided by ECMA(*) when it is registered in ISO.  Code range
 525    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 526
 527    Note (*): ECMA = European Computer Manufacturers Association
 528
 529    Here are examples of graphic character set [NAME(<F>)]:
 530         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 531         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 532         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 533         o DIMENSION2_CHARS96 -- none for the moment
 534
 535    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 536         C0 [0x00..0x1F] -- control character plane 0
 537         GL [0x20..0x7F] -- graphic character plane 0
 538         C1 [0x80..0x9F] -- control character plane 1
 539         GR [0xA0..0xFF] -- graphic character plane 1
 540
 541    A control character set is directly designated and invoked to C0 or
 542    C1 by an escape sequence.  The most common case is that ISO646's
 543    control character set is designated/invoked to C0 and ISO6429's
 544    control character set is designated/invoked to C1, and usually
 545    these designations/invocations are omitted in a coded text.  With
 546    7-bit environment, only C0 can be used, and a control character for
 547    C1 is encoded by an appropriate escape sequence to fit in the
 548    environment.  All control characters for C1 have corresponding
 549    escape sequences defined.
 550
 551    A graphic character set is at first designated to one of four
 552    graphic registers (G0 through G3), then these graphic registers are
 553    invoked to GL or GR.  These designations and invocations can be
 554    done independently.  The most common case is that G0 is invoked to
 555    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 556    these invocations and designations are omitted in a coded text.
 557    With 7-bit environment, only GL can be used.
 558
 559    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 560    and 0x7F of GL area work as control characters SPACE and DEL
 561    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 562
 563    There are two ways of invocation: locking-shift and single-shift.
 564    With locking-shift, the invocation lasts until the next different
 565    invocation, whereas with single-shift, the invocation works only
 566    for the following character and doesn't affect locking-shift.
 567    Invocations are done by the following control characters or escape
 568    sequences.
 569
 570    ----------------------------------------------------------------------
 571    function             control char    escape sequence description
 572    ----------------------------------------------------------------------
 573    SI  (shift-in)               0x0F    none            invoke G0 to GL
 574    SO  (shift-out)              0x0E    none            invoke G1 to GL
 575    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 576    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 577    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 578    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 579    ----------------------------------------------------------------------
 580    The first four are for locking-shift.  Control characters for these
 581    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 582
 583    Designations are done by the following escape sequences.
 584    ----------------------------------------------------------------------
 585    escape sequence      description
 586    ----------------------------------------------------------------------
 587    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 588    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 589    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 590    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 591    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 592    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 593    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 594    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 595    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 596    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 597    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 598    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 599    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 600    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 601    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 602    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 603    ----------------------------------------------------------------------
 604
 605    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 606    of dimension 1, chars 94, and final character <F>, and etc.
 607
 608    Note (*): Although these designations are not allowed in ISO2022,
 609    Emacs accepts them on decoding, and produces them on encoding
 610    CHARS96 character set in a coding system which is characterized as
 611    7-bit environment, non-locking-shift, and non-single-shift.
 612
 613    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 614    '(' can be omitted.  We call this as "short-form" here after.
 615
 616    Now you may notice that there are a lot of ways for encoding the
 617    same multilingual text in ISO2022.  Actually, there exist many
 618    coding systems such as Compound Text (used in X's inter client
 619    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 620    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 621    localized platforms), and all of these are variants of ISO2022.
 622
 623    In addition to the above, Emacs handles two more kinds of escape
 624    sequences: ISO6429's direction specification and Emacs' private
 625    sequence for specifying character composition.
 626
 627    ISO6429's direction specification takes the following format:
 628         o CSI ']'      -- end of the current direction
 629         o CSI '0' ']'  -- end of the current direction
 630         o CSI '1' ']'  -- start of left-to-right text
 631         o CSI '2' ']'  -- start of right-to-left text
 632    The control character CSI (0x9B: control sequence introducer) is
 633    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 634
 635    Character composition specification takes the following format:
 636         o ESC '0' -- start character composition
 637         o ESC '1' -- end character composition
 638    Since these are not standard escape sequences of any ISO, the use
 639    of them with these meanings is restricted to Emacs only.  */
 640
 641 enum iso_code_class_type iso_code_class[256];
 642 /* Nonzero if category IDX's coding system requests CHARSET's designation.  */
 643 #define CHARSET_OK(idx, charset)                \
 644   (CODING_SPEC_ISO_REQUESTED_DESIGNATION        \
 645    (coding_system_table[idx], charset)          \
 646    != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
 647 /* Nonzero if category IDX's coding system has an initial designation >= 0 for register 1.  */
 648 #define SHIFT_OUT_OK(idx) \
 649   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 650
 651 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 652    Check if a text is encoded in ISO2022.  If it is, return an
 653    integer in which the appropriate flag bits among:
 654         CODING_CATEGORY_MASK_ISO_7
 655         CODING_CATEGORY_MASK_ISO_7_TIGHT
 656         CODING_CATEGORY_MASK_ISO_8_1
 657         CODING_CATEGORY_MASK_ISO_8_2
 658         CODING_CATEGORY_MASK_ISO_7_ELSE
 659         CODING_CATEGORY_MASK_ISO_8_ELSE
 660    are set.  If a code which should never appear in ISO2022 is found,
 661    return 0.  */
 662
 663 int
 664 detect_coding_iso2022 (src, src_end)
 665      unsigned char *src, *src_end;
 666 {
 667   int mask = CODING_CATEGORY_MASK_ISO;
 668   int mask_found = 0;
 669   int reg[4], shift_out = 0;
 670   int c, c1, i, charset;
 671
 672   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 673   while (mask && src < src_end)
 674     {
 675       c = *src++;
 676       switch (c)
 677         {
 678         case ISO_CODE_ESC:
 679           if (src >= src_end)
 680             break;
 681           c = *src++;
 682           if (c >= '(' && c <= '/')
 683             {
 684               /* Designation sequence for a charset of dimension 1.  */
 685               if (src >= src_end)
 686                 break;
 687               c1 = *src++;
 688               if (c1 < ' ' || c1 >= 0x80
 689                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 690                 /* Invalid designation sequence.  Just ignore.  */
 691                 break;
 692               reg[(c - '(') % 4] = charset;
 693             }
 694           else if (c == '$')
 695             {
 696               /* Designation sequence for a charset of dimension 2.  */
 697               if (src >= src_end)
 698                 break;
 699               c = *src++;
 700               if (c >= '@' && c <= 'B')
 701                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 702                 reg[0] = charset = iso_charset_table[1][0][c];
 703               else if (c >= '(' && c <= '/')
 704                 {
 705                   if (src >= src_end)
 706                     break;
 707                   c1 = *src++;
 708                   if (c1 < ' ' || c1 >= 0x80
 709                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 710                     /* Invalid designation sequence.  Just ignore.  */
 711                     break;
 712                   reg[(c - '(') % 4] = charset;
 713                 }
 714               else
 715                 /* Invalid designation sequence.  Just ignore.  */
 716                 break;
 717             }
 718           else if (c == 'N' || c == 'n')
 719             {
 720               if (shift_out == 0
 721                   && (reg[1] >= 0
 722                       || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 723                       || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 724                 {
 725                   /* Locking shift out.  */
 726                   mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 727                   mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 728                   shift_out = 1;
 729                 }
 730               break;
 731             }
 732           else if (c == 'O' || c == 'o')
 733             {
 734               if (shift_out == 1)
 735                 {
 736                   /* Locking shift in.  */
 737                   mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 738                   mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 739                   shift_out = 0;
 740                 }
 741               break;
 742             }
 743           else if (c == '0' || c == '1' || c == '2')
 744             /* Start/end composition.  Just ignore.  */
 745             break;
 746           else
 747             /* Invalid escape sequence.  Just ignore.  */
 748             break;
 749
 750           /* We found a valid designation sequence for CHARSET.  */
 751           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 752           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 753             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 754           else
 755             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 756           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 757             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 758           else
 759             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 760           if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 761             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 762           if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 763             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 764           break;
 765
 766         case ISO_CODE_SO:
 767           if (shift_out == 0
 768               && (reg[1] >= 0
 769                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 770                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 771             {
 772               /* Locking shift out.  */
 773               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 774               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 775             }
 776           break;
 777
 778         case ISO_CODE_SI:
 779           if (shift_out == 1)
 780             {
 781               /* Locking shift in.  */
 782               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 783               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 784             }
 785           break;
 786
 787         case ISO_CODE_CSI:
 788         case ISO_CODE_SS2:
 789         case ISO_CODE_SS3:
 790           {
 791             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 792
 793             if (c != ISO_CODE_CSI)
 794               {
 795                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 796                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 797                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 798                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 799                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 800                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 801               }
 802             if (VECTORP (Vlatin_extra_code_table)
 803                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 804               {
 805                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 806                     & CODING_FLAG_ISO_LATIN_EXTRA)
 807                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 808                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 809                     & CODING_FLAG_ISO_LATIN_EXTRA)
 810                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 811               }
 812             mask &= newmask;
 813             mask_found |= newmask;
 814           }
 815           break;
 816
 817         default:
 818           if (c < 0x80)
 819             break;
 820           else if (c < 0xA0)
 821             {
 822               if (VECTORP (Vlatin_extra_code_table)
 823                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 824                 {
 825                   int newmask = 0;
 826
 827                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 828                       & CODING_FLAG_ISO_LATIN_EXTRA)
 829                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 830                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 831                       & CODING_FLAG_ISO_LATIN_EXTRA)
 832                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 833                   mask &= newmask;
 834                   mask_found |= newmask;
 835                 }
 836               else
 837                 return 0;
 838             }
 839           else
 840             {
 841               unsigned char *src_begin = src;
 842
 843               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 844                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 845               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 846               while (src < src_end && *src >= 0xA0)
 847                 src++;
 848               if ((src - src_begin - 1) & 1 && src < src_end)
 849                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 850               else
 851                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
 852             }
 853           break;
 854         }
 855     }
 856
 857   return (mask & mask_found);
 858 }
 859
 860 /* Decode a character of which charset is CHARSET and the 1st position
 861    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 862    fetched from SRC and set to C2.  If CHARSET is negative, it means
 863    that we are decoding ill formed text, and what we can do is just to
 864    read C1 as is.  */
 865
 866 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 867   do {                                                                  \
 868     int c_alt, charset_alt = (charset);                                 \
 869     if (COMPOSING_HEAD_P (coding->composing))                           \
 870       {                                                                 \
 871         *dst++ = LEADING_CODE_COMPOSITION;                              \
 872         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 873           /* To tell composition rules are embeded.  */                 \
 874           *dst++ = 0xFF;                                                \
 875         coding->composing += 2;                                         \
 876       }                                                                 \
 877     if ((charset) >= 0)                                                 \
 878       {                                                                 \
 879         if (CHARSET_DIMENSION (charset) == 2)                           \
 880           {                                                             \
 881             ONE_MORE_BYTE (c2);                                         \
 882             if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
 883                 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
 884               {                                                         \
 885                 src--;                                                  \
 886                 c2 = ' ';                                               \
 887               }                                                         \
 888           }                                                             \
 889         if (!NILP (unification_table)                                   \
 890             && ((c_alt = unify_char (unification_table,                 \
 891                                      -1, (charset), c1, c2)) >= 0))     \
 892           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 893       }                                                                 \
 894     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 895       DECODE_CHARACTER_ASCII (c1);                                      \
 896     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 897       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 898     else                                                                \
 899       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 900     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 901       /* To tell a composition rule follows.  */                        \
 902       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 903   } while (0)
 904
 905 /* Set designation state into CODING.  */
 906 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
 907   do {                                                                     \
 908     int charset = ISO_CHARSET_TABLE (make_number (dimension),              \
 909                                      make_number (chars),                  \
 910                                      make_number (final_char));            \
 911     if (charset >= 0                                                       \
 912         && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg) \
 913       {                                                                    \
 914         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
 915             && reg == 0                                                    \
 916             && charset == CHARSET_ASCII)                                   \
 917           {                                                                \
 918             /* We should insert this designation sequence as is so         \
 919                that it is surely written back to a file.  */               \
 920             coding->spec.iso2022.last_invalid_designation_register = -1;   \
 921             goto label_invalid_code;                                       \
 922           }                                                                \
 923         coding->spec.iso2022.last_invalid_designation_register = -1;       \
 924         if ((coding->mode & CODING_MODE_DIRECTION)                         \
 925             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
 926           charset = CHARSET_REVERSE_CHARSET (charset);                     \
 927         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
 928       }                                                                    \
 929     else                                                                   \
 930       {                                                                    \
 931         coding->spec.iso2022.last_invalid_designation_register = reg;      \
 932         goto label_invalid_code;                                           \
 933       }                                                                    \
 934   } while (0)
 935
 936 /* Check if the current composing sequence contains only valid codes.
 937    If the composing sequence doesn't end before SRC_END, return -1.
 938    Else, if it contains only valid codes, return 0.
 939    Else return the length of the composing sequence.  */
 940
 941 int check_composing_code (coding, src, src_end)
 942      struct coding_system *coding;
 943      unsigned char *src, *src_end;
 944 {
 945   unsigned char *src_start = src;
 946   int invalid_code_found = 0;
 947   int charset, c, c1, dim;
 948
 949   while (src < src_end)
 950     {
 951       if (*src++ != ISO_CODE_ESC) continue;
 952       if (src >= src_end) break;
 953       if ((c = *src++) == '1') /* end of compsition */
 954         return (invalid_code_found ? src - src_start : 0);
 955       if (src + 2 >= src_end) break;
 956       if (!coding->flags & CODING_FLAG_ISO_DESIGNATION)
 957         invalid_code_found = 1;
 958       else
 959         {
 960           dim = 0;
 961           if (c == '$')
 962             {
 963               dim = 1;
 964               c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
 965             }
 966           if (c >= '(' && c <= '/')
 967             {
 968               c1 = *src++;
 969               if ((c1 < ' ' || c1 >= 0x80)
 970                   || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
 971                   || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
 972                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
 973                 invalid_code_found = 1;
 974             }
 975           else
 976             invalid_code_found = 1;
 977         }
 978     }
 979   return ((coding->mode & CODING_MODE_LAST_BLOCK) ? src_end - src_start : -1);
 980 }
 981
 982 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 983
 984 int
 985 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
 986      struct coding_system *coding;
 987      unsigned char *source, *destination;
 988      int src_bytes, dst_bytes;
 989 {
 990   unsigned char *src = source;
 991   unsigned char *src_end = source + src_bytes;
 992   unsigned char *dst = destination;
 993   unsigned char *dst_end = destination + dst_bytes;
 994   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 995      from DST_END to assure that overflow checking is necessary only
 996      at the head of loop.  */
 997   unsigned char *adjusted_dst_end = dst_end - 6;
 998   int charset;
 999   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1000   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1001   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1002   Lisp_Object unification_table
1003     = coding->character_unification_table_for_decode;
1004   int result = CODING_FINISH_NORMAL;
1005
1006   if (!NILP (Venable_character_unification) && NILP (unification_table))
1007     unification_table = Vstandard_character_unification_table_for_decode;
1008
1009   coding->produced_char = 0;
1010   coding->fake_multibyte = 0;
1011   while (src < src_end && (dst_bytes
1012                            ? (dst < adjusted_dst_end)
1013                            : (dst < src - 6)))
1014     {
1015       /* SRC_BASE remembers the start position in source in each loop.
1016          The loop will be exited when there's not enough source text
1017          to analyze long escape sequence or 2-byte code (within macros
1018          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
1019          to SRC_BASE before exiting.  */
1020       unsigned char *src_base = src;
1021       int c1 = *src++, c2;
1022
1023       switch (iso_code_class [c1])
1024         {
1025         case ISO_0x20_or_0x7F:
1026           if (!coding->composing
1027               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1028             {
1029               /* This is SPACE or DEL.  */
1030               *dst++ = c1;
1031               coding->produced_char++;
1032               break;
1033             }
1034           /* This is a graphic character, we fall down ...  */
1035
1036         case ISO_graphic_plane_0:
1037           if (coding->composing == COMPOSING_WITH_RULE_RULE)
1038             {
1039               /* This is a composition rule.  */
1040               *dst++ = c1 | 0x80;
1041               coding->composing = COMPOSING_WITH_RULE_TAIL;
1042             }
1043           else
1044             DECODE_ISO_CHARACTER (charset0, c1);
1045           break;
1046
1047         case ISO_0xA0_or_0xFF:
1048           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1049               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1050             goto label_invalid_code;
1051           /* This is a graphic character, we fall down ... */
1052
1053         case ISO_graphic_plane_1:
1054           if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1055             goto label_invalid_code;
1056           else
1057             DECODE_ISO_CHARACTER (charset1, c1);
1058           break;
1059
1060         case ISO_control_code:
1061           /* All ISO2022 control characters in this class have the
1062              same representation in Emacs internal format.  */
1063           if (c1 == '\n'
1064               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1065               && (coding->eol_type == CODING_EOL_CR
1066                   || coding->eol_type == CODING_EOL_CRLF))
1067             {
1068               result = CODING_FINISH_INCONSISTENT_EOL;
1069               goto label_end_of_loop_2;
1070             }
1071           *dst++ = c1;
1072           coding->produced_char++;
1073           break;
1074
1075         case ISO_carriage_return:
1076           if (coding->eol_type == CODING_EOL_CR)
1077             *dst++ = '\n';
1078           else if (coding->eol_type == CODING_EOL_CRLF)
1079             {
1080               ONE_MORE_BYTE (c1);
1081               if (c1 == ISO_CODE_LF)
1082                 *dst++ = '\n';
1083               else
1084                 {
1085                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1086                     {
1087                       result = CODING_FINISH_INCONSISTENT_EOL;
1088                       goto label_end_of_loop_2;
1089                     }
1090                   src--;
1091                   *dst++ = '\r';
1092                 }
1093             }
1094           else
1095             *dst++ = c1;
1096           coding->produced_char++;
1097           break;
1098
1099         case ISO_shift_out:
1100           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1101               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1102             goto label_invalid_code;
1103           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1104           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1105           break;
1106
1107         case ISO_shift_in:
1108           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1109             goto label_invalid_code;
1110           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1111           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1112           break;
1113
1114         case ISO_single_shift_2_7:
1115         case ISO_single_shift_2:
1116           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1117             goto label_invalid_code;
1118           /* SS2 is handled as an escape sequence of ESC 'N' */
1119           c1 = 'N';
1120           goto label_escape_sequence;
1121
1122         case ISO_single_shift_3:
1123           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1124             goto label_invalid_code;
1125           /* SS2 is handled as an escape sequence of ESC 'O' */
1126           c1 = 'O';
1127           goto label_escape_sequence;
1128
1129         case ISO_control_sequence_introducer:
1130           /* CSI is handled as an escape sequence of ESC '[' ...  */
1131           c1 = '[';
1132           goto label_escape_sequence;
1133
1134         case ISO_escape:
1135           ONE_MORE_BYTE (c1);
1136         label_escape_sequence:
1137           /* Escape sequences handled by Emacs are invocation,
1138              designation, direction specification, and character
1139              composition specification.  */
1140           switch (c1)
1141             {
1142             case '&':           /* revision of following character set */
1143               ONE_MORE_BYTE (c1);
1144               if (!(c1 >= '@' && c1 <= '~'))
1145                 goto label_invalid_code;
1146               ONE_MORE_BYTE (c1);
1147               if (c1 != ISO_CODE_ESC)
1148                 goto label_invalid_code;
1149               ONE_MORE_BYTE (c1);
1150               goto label_escape_sequence;
1151
1152             case '$':           /* designation of 2-byte character set */
1153               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1154                 goto label_invalid_code;
1155               ONE_MORE_BYTE (c1);
1156               if (c1 >= '@' && c1 <= 'B')
1157                 {       /* designation of JISX0208.1978, GB2312.1980,
1158                                    or JISX0208.1980 */
1159                   DECODE_DESIGNATION (0, 2, 94, c1);
1160                 }
1161               else if (c1 >= 0x28 && c1 <= 0x2B)
1162                 {       /* designation of DIMENSION2_CHARS94 character set */
1163                   ONE_MORE_BYTE (c2);
1164                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1165                 }
1166               else if (c1 >= 0x2C && c1 <= 0x2F)
1167                 {       /* designation of DIMENSION2_CHARS96 character set */
1168                   ONE_MORE_BYTE (c2);
1169                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1170                 }
1171               else
1172                 goto label_invalid_code;
1173               break;
1174
1175             case 'n':           /* invocation of locking-shift-2 */
1176               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1177                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1178                 goto label_invalid_code;
1179               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1180               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1181               break;
1182
1183             case 'o':           /* invocation of locking-shift-3 */
1184               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1185                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1186                 goto label_invalid_code;
1187               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1188               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1189               break;
1190
1191             case 'N':           /* invocation of single-shift-2 */
1192               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1193                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1194                 goto label_invalid_code;
1195               ONE_MORE_BYTE (c1);
1196               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1197               DECODE_ISO_CHARACTER (charset, c1);
1198               break;
1199
1200             case 'O':           /* invocation of single-shift-3 */
1201               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1202                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1203                 goto label_invalid_code;
1204               ONE_MORE_BYTE (c1);
1205               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1206               DECODE_ISO_CHARACTER (charset, c1);
1207               break;
1208
1209             case '0': case '2': /* start composing */
1210               /* Before processing composing, we must be sure that all
1211                  characters being composed are supported by CODING.
1212                  If not, we must give up composing and insert the
1213                  bunch of codes for composing as is without decoding.  */
1214               {
1215                 int result1;
1216
1217                 result1 = check_composing_code (coding, src, src_end);
1218                 if (result1 == 0)
1219                   coding->composing = (c1 == '0'
1220                                        ? COMPOSING_NO_RULE_HEAD
1221                                        : COMPOSING_WITH_RULE_HEAD);
1222                 else if (result1 > 0)
1223                   {
1224                     if (result1 + 2 < (dst_bytes ? dst_end : src_base) - dst)
1225                       {
1226                         bcopy (src_base, dst, result1 + 2);
1227                         src += result1;
1228                         dst += result1 + 2;
1229                         coding->produced_char += result1 + 2;
1230                       }
1231                     else
1232                       {
1233                         result = CODING_FINISH_INSUFFICIENT_DST;
1234                         goto label_end_of_loop_2;
1235                       }
1236                   }
1237                 else
1238                   goto label_end_of_loop;
1239               }
1240               break;
1241
1242             case '1':           /* end composing */
1243               coding->composing = COMPOSING_NO;
1244               coding->produced_char++;
1245               break;
1246
1247             case '[':           /* specification of direction */
1248               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1249                 goto label_invalid_code;
1250               /* For the moment, nested direction is not supported.
1251                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1252                  left-to-right, and nozero means right-to-left.  */
1253               ONE_MORE_BYTE (c1);
1254               switch (c1)
1255                 {
1256                 case ']':       /* end of the current direction */
1257                   coding->mode &= ~CODING_MODE_DIRECTION;
1258
1259                 case '0':       /* end of the current direction */
1260                 case '1':       /* start of left-to-right direction */
1261                   ONE_MORE_BYTE (c1);
1262                   if (c1 == ']')
1263                     coding->mode &= ~CODING_MODE_DIRECTION;
1264                   else
1265                     goto label_invalid_code;
1266                   break;
1267
1268                 case '2':       /* start of right-to-left direction */
1269                   ONE_MORE_BYTE (c1);
1270                   if (c1 == ']')
1271                     coding->mode |= CODING_MODE_DIRECTION;
1272                   else
1273                     goto label_invalid_code;
1274                   break;
1275
1276                 default:
1277                   goto label_invalid_code;
1278                 }
1279               break;
1280
1281             default:
1282               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1283                 goto label_invalid_code;
1284               if (c1 >= 0x28 && c1 <= 0x2B)
1285                 {       /* designation of DIMENSION1_CHARS94 character set */
1286                   ONE_MORE_BYTE (c2);
1287                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1288                 }
1289               else if (c1 >= 0x2C && c1 <= 0x2F)
1290                 {       /* designation of DIMENSION1_CHARS96 character set */
1291                   ONE_MORE_BYTE (c2);
1292                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1293                 }
1294               else
1295                 {
1296                   goto label_invalid_code;
1297                 }
1298             }
1299           /* We must update these variables now.  */
1300           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1301           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1302           break;
1303
1304         label_invalid_code:
1305           while (src_base < src)
1306             *dst++ = *src_base++;
1307           coding->fake_multibyte = 1;
1308         }
1309       continue;
1310
1311     label_end_of_loop:
1312       result = CODING_FINISH_INSUFFICIENT_SRC;
1313     label_end_of_loop_2:
1314       src = src_base;
1315       break;
1316     }
1317
1318   if (src < src_end)
1319     {
1320       if (result == CODING_FINISH_NORMAL)
1321         result = CODING_FINISH_INSUFFICIENT_DST;
1322       else if (result != CODING_FINISH_INCONSISTENT_EOL
1323                && coding->mode & CODING_MODE_LAST_BLOCK)
1324         {
1325           /* This is the last block of the text to be decoded.  We had
1326              better just flush out all remaining codes in the text
1327              although they are not valid characters.  */
1328           src_bytes = src_end - src;
1329           if (dst_bytes && (dst_end - dst < src_bytes))
1330             src_bytes = dst_end - dst;
1331           bcopy (src, dst, src_bytes);
1332           dst += src_bytes;
1333           src += src_bytes;
1334           coding->fake_multibyte = 1;
1335         }
1336     }
1337
1338   coding->consumed = coding->consumed_char = src - source;
1339   coding->produced = dst - destination;
1340   return result;
1341 }
1342
1343 /* ISO2022 encoding stuff.  */
1344
1345 /*
1346    It is not enough to say just "ISO2022" on encoding, we have to
1347    specify more details.  In Emacs, each coding system of ISO2022
1348    variant has the following specifications:
1349         1. Initial designation to G0 thru G3.
1350         2. Allows short-form designation?
1351         3. ASCII should be designated to G0 before control characters?
1352         4. ASCII should be designated to G0 at end of line?
1353         5. 7-bit environment or 8-bit environment?
1354         6. Use locking-shift?
1355         7. Use Single-shift?
1356    And the following two are only for Japanese:
        8. Use ASCII in place of JISX0201-1976-Roman?
1358         9. Use JISX0208-1983 in place of JISX0208-1978?
1359    These specifications are encoded in `coding->flags' as flag bits
1360    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1361    details.
1362 */
1363
/* Write to DST the escape sequence that designates CHARSET to graphic
   register REG, and record that designation in CODING.  If CODING has
   a revision number below 255 for CHARSET, the sequence is preceded
   by `ESC & <'@' + revision>'.  The short form (no intermediate
   character) is produced only when CODING allows it, CHARSET is a
   2-dimensional 94-character set, REG is 0, and the <final-char> of
   CHARSET is '@', 'A', or 'B'.

   The expansion uses the variable `dst' of the enclosing function.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    char *desig_inter_94 = "()*+";                                      \
    char *desig_inter_96 = ",-./";                                      \
    unsigned char desig_final = CHARSET_ISO_FINAL_CHAR (charset);       \
    int desig_revision                                                  \
      = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);               \
    int desig_is_94 = (CHARSET_CHARS (charset) == 94);                  \
    if (desig_revision < 255)                                           \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + desig_revision;                                  \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-byte set: `$' precedes the intermediate character.  */ \
        *dst++ = '$';                                                   \
        if (! desig_is_94)                                              \
          *dst++ = (unsigned char) (desig_inter_96[reg]);               \
        else if (! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM)        \
                    && (reg) == 0                                       \
                    && desig_final >= '@' && desig_final <= 'B'))       \
          /* Short form does not apply; give the intermediate.  */      \
          *dst++ = (unsigned char) (desig_inter_94[reg]);               \
      }                                                                 \
    else if (desig_is_94)                                               \
      *dst++ = (unsigned char) (desig_inter_94[reg]);                   \
    else                                                                \
      *dst++ = (unsigned char) (desig_inter_96[reg]);                   \
    *dst++ = desig_final;                                               \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1405
1406 /* The following two macros produce codes (control character or escape
1407    sequence) for ISO2022 single-shift functions (single-shift-2 and
1408    single-shift-3).  */
1409
/* Emit the single-shift-2 function at DST: the escape sequence ESC N
   when CODING is flagged as seven-bit, otherwise the single byte
   ISO_CODE_SS2 (in which case CODING->fake_multibyte is set).  In
   both cases CODING is marked as being in the single-shifting
   state.  */
#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      {                                                 \
        coding->fake_multibyte = 1;                     \
        *dst++ = ISO_CODE_SS2;                          \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
  } while (0)
1421
/* Emit the single-shift-3 function at DST: the escape sequence ESC O
   when CODING is flagged as seven-bit, otherwise the single byte
   ISO_CODE_SS3 (in which case CODING->fake_multibyte is set).  In
   both cases CODING is marked as being in the single-shifting
   state.  */
#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      {                                                 \
        coding->fake_multibyte = 1;                     \
        *dst++ = ISO_CODE_SS3;                          \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
  } while (0)
1433
1434 /* The following four macros produce codes (control character or
1435    escape sequence) for ISO2022 locking-shift functions (shift-in,
1436    shift-out, locking-shift-2, and locking-shift-3).  */
1437
/* Emit the shift-in control code at DST and record in CODING that
   graphic register 0 is what graphic plane 0 now invokes.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)
1443
/* Emit the shift-out control code at DST and record in CODING that
   graphic register 1 is what graphic plane 0 now invokes.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)
1449
/* Emit the locking-shift-2 escape sequence (ESC n) at DST and record
   in CODING that graphic register 2 is what graphic plane 0 now
   invokes.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)
1455
/* Emit the locking-shift-3 escape sequence (ESC o) at DST and record
   in CODING that graphic register 3 is what graphic plane 0 now
   invokes.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1461
/* Write at DST the code of a one-byte (DIMENSION1) character that
   belongs to CHARSET and has position-code C1.

   The macro is a loop.  Each pass either emits the character and
   leaves the loop, or calls encode_invocation_designation to emit
   whatever designation/invocation is missing and then tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    /* A pending single shift applies to this one character only.  */  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to graphic plane 0: emit with the 8th bit    \
       cleared.  */                                                     \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to graphic plane 1: emit with the 8th bit    \
       set.  */                                                         \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not a safe charset of this coding system: emit the   \
       substitution character once, or twice for a width-2 charset,    \
       instead of the character.  */                                    \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Designate     \
       and/or invoke it, then go around again to emit the code.  */    \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1505
/* Write at DST the code of a two-byte (DIMENSION2) character that
   belongs to CHARSET and has position-codes C1 and C2.

   The macro is a loop.  Each pass either emits the character and
   leaves the loop, or calls encode_invocation_designation to emit
   whatever designation/invocation is missing and then tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    /* A pending single shift applies to this one character only.  */  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to graphic plane 0: emit both bytes with     \
       the 8th bit cleared.  */                                         \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is invoked to graphic plane 1: emit both bytes with     \
       the 8th bit set.  */                                             \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not a safe charset of this coding system: emit the   \
       substitution character once, or twice for a width-2 charset,    \
       instead of the character.  */                                    \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Designate     \
       and/or invoke it, then go around again to emit the codes.  */   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1548
/* Encode one character of CHARSET with position-codes C1 and C2 (C2
   is a dummy for a one-byte charset).

   If UNIFICATION_TABLE is non-nil and maps the character to another
   one, the mapped character is encoded instead; C1 and C2 are
   overwritten in that case.  ASCII is replaced by
   charset_latin_jisx0201 under CODING_FLAG_ISO_USE_ROMAN, and
   charset_jisx0208 by charset_jisx0208_1978 under
   CODING_FLAG_ISO_USE_OLDJIS; both tests look at the original
   CHARSET, not at the unified one.  CODING->consumed_char is
   incremented unless a composition is in progress.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int c_alt, charset_alt = charset;                                     \
                                                                          \
    if (!NILP (unification_table))                                        \
      {                                                                   \
        c_alt = unify_char (unification_table, -1, charset, c1, c2);      \
        if (c_alt >= 0)                                                   \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                        \
      }                                                                   \
    if (CHARSET_DIMENSION (charset_alt) != 1)                             \
      {                                                                   \
        if (charset == charset_jisx0208                                   \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
          charset_alt = charset_jisx0208_1978;                            \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);            \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        if (charset == CHARSET_ASCII                                      \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
          charset_alt = charset_latin_jisx0201;                           \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                \
      }                                                                   \
    if (! COMPOSING_P (coding->composing))                                \
      coding->consumed_char++;                                            \
  } while (0)
1575
1576 /* Produce designation and invocation codes at a place pointed by DST
1577    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1578    Return new DST.  */
1579
1580 unsigned char *
1581 encode_invocation_designation (charset, coding, dst)
1582      int charset;
1583      struct coding_system *coding;
1584      unsigned char *dst;
1585 {
1586   int reg;                      /* graphic register number */
1587
1588   /* At first, check designations.  */
1589   for (reg = 0; reg < 4; reg++)
1590     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1591       break;
1592
1593   if (reg >= 4)
1594     {
1595       /* CHARSET is not yet designated to any graphic registers.  */
1596       /* At first check the requested designation.  */
1597       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1598       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1599         /* Since CHARSET requests no special designation, designate it
1600            to graphic register 0.  */
1601         reg = 0;
1602
1603       ENCODE_DESIGNATION (charset, reg, coding);
1604     }
1605
1606   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1607       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1608     {
1609       /* Since the graphic register REG is not invoked to any graphic
1610          planes, invoke it to graphic plane 0.  */
1611       switch (reg)
1612         {
1613         case 0:                 /* graphic register 0 */
1614           ENCODE_SHIFT_IN;
1615           break;
1616
1617         case 1:                 /* graphic register 1 */
1618           ENCODE_SHIFT_OUT;
1619           break;
1620
1621         case 2:                 /* graphic register 2 */
1622           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1623             ENCODE_SINGLE_SHIFT_2;
1624           else
1625             ENCODE_LOCKING_SHIFT_2;
1626           break;
1627
1628         case 3:                 /* graphic register 3 */
1629           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1630             ENCODE_SINGLE_SHIFT_3;
1631           else
1632             ENCODE_LOCKING_SHIFT_3;
1633           break;
1634         }
1635     }
1636   return dst;
1637 }
1638
/* The following three macros produce codes for indicating composition.  */
/* Each of these writes a two-byte escape sequence at DST: ESC 0 to
   start a composition without rules, ESC 2 to start a composition
   with rules, and ESC 1 to end a composition.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1643
1644 /* The following three macros produce codes for indicating direction
1645    of text.  */
/* Emit a control sequence introducer at DST: the escape sequence
   ESC [ when CODING is flagged as seven-bit, otherwise the single
   byte ISO_CODE_CSI.

   The seven-bit flag is tested with `&', like everywhere else in
   this file.  Comparing the whole flag word with `==' would only
   match when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
1653
/* Emit at DST the control sequence marking right-to-left text: a
   control sequence introducer followed by `2' and `]'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is a do-while statement, so it
   cannot be the left operand of a comma operator; the pieces are
   therefore sequenced as statements inside one block.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)
1656
/* Emit at DST the control sequence marking left-to-right text: a
   control sequence introducer followed by `0' and `]'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is a do-while statement, so it
   cannot be the left operand of a comma operator; the pieces are
   therefore sequenced as statements inside one block.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1659
/* Bring the graphic planes and registers back to their initial state,
   writing the necessary codes at DST: a shift-in if graphic plane 0
   does not invoke graphic register 0, then a designation for every
   register that has an initial designation different from its
   current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (reg = 0; reg < 4; reg++)                                           \
      {                                                                     \
        /* Registers without an initial designation are left alone.  */    \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)          \
          continue;                                                         \
        /* So are registers that already hold their initial charset.  */   \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                       \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))           \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                     \
  } while (0)
1674
1675 /* Produce designation sequences of charsets in the line started from
1676    SRC to a place pointed by *DSTP, and update DSTP.
1677
1678    If the current block ends before any end-of-line, we may fail to
1679    find all the necessary designations.  */
1680
1681 encode_designation_at_bol (coding, table, src, src_end, dstp)
1682      struct coding_system *coding;
1683      Lisp_Object table;
1684      unsigned char *src, *src_end, **dstp;
1685 {
1686   int charset, c, found = 0, reg;
1687   /* Table of charsets to be designated to each graphic register.  */
1688   int r[4];
1689   unsigned char *dst = *dstp;
1690
1691   for (reg = 0; reg < 4; reg++)
1692     r[reg] = -1;
1693
1694   while (src < src_end && *src != '\n' && found < 4)
1695     {
1696       int bytes = BYTES_BY_CHAR_HEAD (*src);
1697
1698       if (NILP (table))
1699         charset = CHARSET_AT (src);
1700       else
1701         {
1702           int c_alt;
1703           unsigned char c1, c2;
1704
1705           SPLIT_STRING(src, bytes, charset, c1, c2);
1706           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1707             charset = CHAR_CHARSET (c_alt);
1708         }
1709
1710       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1711       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1712         {
1713           found++;
1714           r[reg] = charset;
1715         }
1716
1717       src += bytes;
1718     }
1719
1720   if (found)
1721     {
1722       for (reg = 0; reg < 4; reg++)
1723         if (r[reg] >= 0
1724             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1725           ENCODE_DESIGNATION (r[reg], reg, coding);
1726       *dstp = dst;
1727     }
1728 }
1729
1730 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1731
1732 int
1733 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1734      struct coding_system *coding;
1735      unsigned char *source, *destination;
1736      int src_bytes, dst_bytes;
1737 {
1738   unsigned char *src = source;
1739   unsigned char *src_end = source + src_bytes;
1740   unsigned char *dst = destination;
1741   unsigned char *dst_end = destination + dst_bytes;
1742   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1743      from DST_END to assure overflow checking is necessary only at the
1744      head of loop.  */
1745   unsigned char *adjusted_dst_end = dst_end - 19;
1746   Lisp_Object unification_table
1747       = coding->character_unification_table_for_encode;
1748   int result = CODING_FINISH_NORMAL;
1749
1750   if (!NILP (Venable_character_unification) && NILP (unification_table))
1751     unification_table = Vstandard_character_unification_table_for_encode;
1752
1753   coding->consumed_char = 0;
1754   coding->fake_multibyte = 0;
1755   while (src < src_end && (dst_bytes
1756                            ? (dst < adjusted_dst_end)
1757                            : (dst < src - 19)))
1758     {
1759       /* SRC_BASE remembers the start position in source in each loop.
1760          The loop will be exited when there's not enough source text
1761          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1762          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1763          reset to SRC_BASE before exiting.  */
1764       unsigned char *src_base = src;
1765       int charset, c1, c2, c3, c4;
1766
1767       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1768           && CODING_SPEC_ISO_BOL (coding))
1769         {
1770           /* We have to produce designation sequences if any now.  */
1771           encode_designation_at_bol (coding, unification_table,
1772                                      src, src_end, &dst);
1773           CODING_SPEC_ISO_BOL (coding) = 0;
1774         }
1775
1776       c1 = *src++;
1777       /* If we are seeing a component of a composite character, we are
1778          seeing a leading-code encoded irregularly for composition, or
1779          a composition rule if composing with rule.  We must set C1 to
1780          a normal leading-code or an ASCII code.  If we are not seeing
1781          a composite character, we must reset composition,
1782          designation, and invocation states.  */
1783       if (COMPOSING_P (coding->composing))
1784         {
1785           if (c1 < 0xA0)
1786             {
1787               /* We are not in a composite character any longer.  */
1788               coding->composing = COMPOSING_NO;
1789               ENCODE_RESET_PLANE_AND_REGISTER;
1790               ENCODE_COMPOSITION_END;
1791             }
1792           else
1793             {
1794               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1795                 {
1796                   *dst++ = c1 & 0x7F;
1797                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1798                   continue;
1799                 }
1800               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1801                 coding->composing = COMPOSING_WITH_RULE_RULE;
1802               if (c1 == 0xA0)
1803                 {
1804                   /* This is an ASCII component.  */
1805                   ONE_MORE_BYTE (c1);
1806                   c1 &= 0x7F;
1807                 }
1808               else
1809                 /* This is a leading-code of non ASCII component.  */
1810                 c1 -= 0x20;
1811             }
1812         }
1813
1814       /* Now encode one character.  C1 is a control character, an
1815          ASCII character, or a leading-code of multi-byte character.  */
1816       switch (emacs_code_class[c1])
1817         {
1818         case EMACS_ascii_code:
1819           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1820           break;
1821
1822         case EMACS_control_code:
1823           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1824             ENCODE_RESET_PLANE_AND_REGISTER;
1825           *dst++ = c1;
1826           coding->consumed_char++;
1827           break;
1828
1829         case EMACS_carriage_return_code:
1830           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1831             {
1832               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1833                 ENCODE_RESET_PLANE_AND_REGISTER;
1834               *dst++ = c1;
1835               coding->consumed_char++;
1836               break;
1837             }
1838           /* fall down to treat '\r' as '\n' ...  */
1839
1840         case EMACS_linefeed_code:
1841           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1842             ENCODE_RESET_PLANE_AND_REGISTER;
1843           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1844             bcopy (coding->spec.iso2022.initial_designation,
1845                    coding->spec.iso2022.current_designation,
1846                    sizeof coding->spec.iso2022.initial_designation);
1847           if (coding->eol_type == CODING_EOL_LF
1848               || coding->eol_type == CODING_EOL_UNDECIDED)
1849             *dst++ = ISO_CODE_LF;
1850           else if (coding->eol_type == CODING_EOL_CRLF)
1851             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1852           else
1853             *dst++ = ISO_CODE_CR;
1854           CODING_SPEC_ISO_BOL (coding) = 1;
1855           coding->consumed_char++;
1856           break;
1857
1858         case EMACS_leading_code_2:
1859           ONE_MORE_BYTE (c2);
1860           if (c2 < 0xA0)
1861             {
1862               /* invalid sequence */
1863               *dst++ = c1;
1864               *dst++ = c2;
1865               coding->consumed_char += 2;
1866             }
1867           else
1868             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1869           break;
1870
1871         case EMACS_leading_code_3:
1872           TWO_MORE_BYTES (c2, c3);
1873           if (c2 < 0xA0 || c3 < 0xA0)
1874             {
1875               /* invalid sequence */
1876               *dst++ = c1;
1877               *dst++ = c2;
1878               *dst++ = c3;
1879               coding->consumed_char += 3;
1880             }
1881           else if (c1 < LEADING_CODE_PRIVATE_11)
1882             ENCODE_ISO_CHARACTER (c1, c2, c3);
1883           else
1884             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1885           break;
1886
1887         case EMACS_leading_code_4:
1888           THREE_MORE_BYTES (c2, c3, c4);
1889           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1890             {
1891               /* invalid sequence */
1892               *dst++ = c1;
1893               *dst++ = c2;
1894               *dst++ = c3;
1895               *dst++ = c4;
1896               coding->consumed_char += 4;
1897             }
1898           else
1899             ENCODE_ISO_CHARACTER (c2, c3, c4);
1900           break;
1901
1902         case EMACS_leading_code_composition:
1903           ONE_MORE_BYTE (c2);
1904           if (c2 < 0xA0)
1905             {
1906               /* invalid sequence */
1907               *dst++ = c1;
1908               *dst++ = c2;
1909               coding->consumed_char += 2;
1910             }
1911           else if (c2 == 0xFF)
1912             {
1913               ENCODE_RESET_PLANE_AND_REGISTER;
1914               coding->composing = COMPOSING_WITH_RULE_HEAD;
1915               ENCODE_COMPOSITION_WITH_RULE_START;
1916               coding->consumed_char++;
1917             }
1918           else
1919             {
1920               ENCODE_RESET_PLANE_AND_REGISTER;
1921               /* Rewind one byte because it is a character code of
1922                  composition elements.  */
1923               src--;
1924               coding->composing = COMPOSING_NO_RULE_HEAD;
1925               ENCODE_COMPOSITION_NO_RULE_START;
1926               coding->consumed_char++;
1927             }
1928           break;
1929
1930         case EMACS_invalid_code:
1931           *dst++ = c1;
1932           coding->consumed_char++;
1933           break;
1934         }
1935       continue;
1936     label_end_of_loop:
1937       result = CODING_FINISH_INSUFFICIENT_SRC;
1938       src = src_base;
1939       break;
1940     }
1941
1942   if (src < src_end)
1943     {
1944       if (result == CODING_FINISH_NORMAL)
1945         result = CODING_FINISH_INSUFFICIENT_DST;
1946       else
1947         /* If this is the last block of the text to be encoded, we
1948            must reset graphic planes and registers to the initial
1949            state, and flush out the carryover if any.  */
1950         if (coding->mode & CODING_MODE_LAST_BLOCK)
1951           ENCODE_RESET_PLANE_AND_REGISTER;
1952     }
1953
1954   coding->consumed = src - source;
1955   coding->produced = coding->produced_char = dst - destination;
1956   return result;
1957 }
1958
1959 \f
1960 /*** 4. SJIS and BIG5 handlers ***/
1961
1962 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1963    quite widely.  So, for the moment, Emacs supports them in the bare
1964    C code.  But, in the future, they may be supported only by CCL.  */
1965
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
1972
1973    --- CODE RANGE of SJIS ---
1974    (character set)      (range)
1975    ASCII                0x00 .. 0x7F
1976    KATAKANA-JISX0201    0xA0 .. 0xDF
1977    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1978             (2nd byte)  0x40 .. 0xFF
1979    -------------------------------
1980
1981 */
1982
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
1986
1987    --- CODE RANGE of BIG5 ---
1988    (character set)      (range)
1989    ASCII                0x00 .. 0x7F
1990    Big5 (1st byte)      0xA1 .. 0xFE
1991         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1992    --------------------------
1993
1994    Since the number of characters in Big5 is larger than maximum
1995    characters in Emacs' charset (96x96), it can't be handled as one
1996    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1997    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1998    contains frequently used characters and the latter contains less
1999    frequently used characters.  */
2000
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2005
/* Number of Big5 characters sharing one value of the 1st byte: the
   2nd byte takes the values 0xA1..0xFE and 0x40..0x7E.  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))
2008
/* Convert the Big5 code whose 1st and 2nd bytes are B1 and B2 into
   an Emacs charset and position-codes.  CHARSET is set to
   charset_big5_1 when B1 is below 0xC9 and to charset_big5_2
   otherwise; C1 and C2 receive the position-codes, each 0x21 or
   above.  CHARSET, C1 and C2 must be lvalues.

   B1 and B2 are evaluated more than once, so they must not have side
   effects.  Every argument is parenthesized so that an operator
   expression such as `x & 0xFF' can be passed safely.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character counted from code 0xA140.  */     \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)
2023
/* Convert a character of CHARSET (charset_big5_1 or charset_big5_2)
   with position-codes C1 and C2 into the 1st and 2nd bytes B1 and B2
   of its Big5 code.  B1 and B2 must be lvalues.  This is the inverse
   of DECODE_BIG5.

   Every argument is parenthesized so that an operator expression
   such as `c & 0x7F' can be passed safely.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    /* Linear index of the character within its charset.  */           \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    /* charset_big5_2 starts where the 1st byte is 0xC9.  */           \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* The first 0x3F values map to 0x40..0x7E, the rest to            \
       0xA1..0xFE.  */                                                  \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2033
/* Decode one character of CHARSET with position-codes C1 and C2.

   If UNIFICATION_TABLE is non-nil and maps the character to another
   one, the mapped character is decoded instead; C1 and C2 are
   overwritten in that case.  The character is then handed to the
   ASCII, one-byte or two-byte decoding macro; a negative resulting
   charset is handled like ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
                                                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2);  \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt != CHARSET_ASCII && charset_alt >= 0)               \
      {                                                                 \
        if (CHARSET_DIMENSION (charset_alt) == 1)                       \
          DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                \
        else                                                            \
          DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);            \
      }                                                                 \
    else                                                                \
      DECODE_CHARACTER_ASCII (c1);                                      \
  } while (0)
2048
/* Encode the character (CHARSET, C1, C2) of Emacs' internal format
   and store the result at DST, advancing DST by one to three bytes.
   The variables `unification_table', `sjis_p', `dst' and `coding' are
   taken from the expansion site.  C1 and C2 must be lvalues because
   they are overwritten.

   If `unification_table' is non-nil and unify_char returns a
   non-negative character, that character replaces the original.
   Then:
     - an ASCII character is stored as the single byte C1;
     - a dimension-1 character is stored as C1 alone when encoding
       SJIS and the charset is `charset_katakana_jisx0201', otherwise
       as the charset byte followed by C1;
     - a dimension-2 character has the 8th bit of C1 and C2 cleared
       and is stored as two SJIS bytes (SJIS and `charset_jisx0208'),
       two BIG5 bytes (BIG5 and `charset_big5_1'/`charset_big5_2'),
       or else as the three bytes charset, C1, C2.
   `coding->fake_multibyte' is set to 1 on every path except the
   ASCII, the SJIS JISX0201-Kana and the BIG5 ones, and
   `coding->consumed_char' is incremented.

   The expansion deliberately has no trailing semicolon so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and
   can be used as the body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, (charset),        \
                                 c1, c2))                                 \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = (charset);                                            \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          {                                                               \
            *dst++ = charset_alt, *dst++ = c1;                            \
            coding->fake_multibyte = 1;                                   \
          }                                                               \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
            coding->fake_multibyte = 1;                                   \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          {                                                               \
            /* Not representable: three raw bytes are written.  */        \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;               \
            coding->fake_multibyte = 1;                                   \
          }                                                               \
      }                                                                   \
    coding->consumed_char++;                                              \
  } while (0)
2098
2099 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2100    Check if a text is encoded in SJIS.  If it is, return
2101    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2102
2103 int
2104 detect_coding_sjis (src, src_end)
2105      unsigned char *src, *src_end;
2106 {
2107   unsigned char c;
2108
2109   while (src < src_end)
2110     {
2111       c = *src++;
2112       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2113         {
2114           if (src < src_end && *src++ < 0x40)
2115             return 0;
2116         }
2117     }
2118   return CODING_CATEGORY_MASK_SJIS;
2119 }
2120
2121 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2122    Check if a text is encoded in BIG5.  If it is, return
2123    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2124
2125 int
2126 detect_coding_big5 (src, src_end)
2127      unsigned char *src, *src_end;
2128 {
2129   unsigned char c;
2130
2131   while (src < src_end)
2132     {
2133       c = *src++;
2134       if (c >= 0xA1)
2135         {
2136           if (src >= src_end)
2137             break;
2138           c = *src++;
2139           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2140             return 0;
2141         }
2142     }
2143   return CODING_CATEGORY_MASK_BIG5;
2144 }
2145
2146 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2147    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2148
2149 int
2150 decode_coding_sjis_big5 (coding, source, destination,
2151                          src_bytes, dst_bytes, sjis_p)
2152      struct coding_system *coding;
2153      unsigned char *source, *destination;
2154      int src_bytes, dst_bytes;
2155      int sjis_p;
2156 {
2157   unsigned char *src = source;
2158   unsigned char *src_end = source + src_bytes;
2159   unsigned char *dst = destination;
2160   unsigned char *dst_end = destination + dst_bytes;
2161   /* Since the maximum bytes produced by each loop is 4, we subtract 3
2162      from DST_END to assure overflow checking is necessary only at the
2163      head of loop.  */
2164   unsigned char *adjusted_dst_end = dst_end - 3;
2165   Lisp_Object unification_table
2166       = coding->character_unification_table_for_decode;
2167   int result = CODING_FINISH_NORMAL;
2168
2169   if (!NILP (Venable_character_unification) && NILP (unification_table))
2170     unification_table = Vstandard_character_unification_table_for_decode;
2171
2172   coding->produced_char = 0;
2173   coding->fake_multibyte = 0;
2174   while (src < src_end && (dst_bytes
2175                            ? (dst < adjusted_dst_end)
2176                            : (dst < src - 3)))
2177     {
2178       /* SRC_BASE remembers the start position in source in each loop.
2179          The loop will be exited when there's not enough source text
2180          to analyze two-byte character (within macro ONE_MORE_BYTE).
2181          In that case, SRC is reset to SRC_BASE before exiting.  */
2182       unsigned char *src_base = src;
2183       unsigned char c1 = *src++, c2, c3, c4;
2184
2185       if (c1 < 0x20)
2186         {
2187           if (c1 == '\r')
2188             {
2189               if (coding->eol_type == CODING_EOL_CRLF)
2190                 {
2191                   ONE_MORE_BYTE (c2);
2192                   if (c2 == '\n')
2193                     *dst++ = c2;
2194                   else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2195                     {
2196                       result = CODING_FINISH_INCONSISTENT_EOL;
2197                       goto label_end_of_loop_2;
2198                     }
2199                   else
2200                     /* To process C2 again, SRC is subtracted by 1.  */
2201                     *dst++ = c1, src--;
2202                 }
2203               else if (coding->eol_type == CODING_EOL_CR)
2204                 *dst++ = '\n';
2205               else
2206                 *dst++ = c1;
2207             }
2208           else if (c1 == '\n'
2209                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2210                    && (coding->eol_type == CODING_EOL_CR
2211                        || coding->eol_type == CODING_EOL_CRLF))
2212             {
2213               result = CODING_FINISH_INCONSISTENT_EOL;
2214               goto label_end_of_loop_2;
2215             }
2216           else
2217             *dst++ = c1;
2218           coding->produced_char++;
2219         }
2220       else if (c1 < 0x80)
2221         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2222       else if (c1 < 0xA0)
2223         {
2224           /* SJIS -> JISX0208 */
2225           if (sjis_p)
2226             {
2227               ONE_MORE_BYTE (c2);
2228               if (c2 >= 0x40)
2229                 {
2230                   DECODE_SJIS (c1, c2, c3, c4);
2231                   DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2232                 }
2233               else
2234                 goto label_invalid_code_2;
2235             }
2236           else
2237             goto label_invalid_code_1;
2238         }
2239       else if (c1 < 0xE0)
2240         {
2241           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
2242           if (sjis_p)
2243             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2244                                         /* dummy */ c2);
2245           else
2246             {
2247               int charset;
2248
2249               ONE_MORE_BYTE (c2);
2250               if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2251                 {
2252                   DECODE_BIG5 (c1, c2, charset, c3, c4);
2253                   DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2254                 }
2255               else
2256                 goto label_invalid_code_2;
2257             }
2258         }
2259       else                      /* C1 >= 0xE0 */
2260         {
2261           /* SJIS -> JISX0208, BIG5 -> Big5 */
2262           if (sjis_p)
2263             {
2264               ONE_MORE_BYTE (c2);
2265               if (c2 >= 0x40)
2266                 {
2267                   DECODE_SJIS (c1, c2, c3, c4);
2268                   DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2269                 }
2270               else
2271                 goto label_invalid_code_2;
2272             }
2273           else
2274             {
2275               int charset;
2276
2277               ONE_MORE_BYTE (c2);
2278               if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2279                 {
2280                   DECODE_BIG5 (c1, c2, charset, c3, c4);
2281                   DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2282                 }
2283               else
2284                 goto label_invalid_code_2;
2285             }
2286         }
2287       continue;
2288
2289     label_invalid_code_1:
2290       *dst++ = c1;
2291       coding->produced_char++;
2292       coding->fake_multibyte = 1;
2293       continue;
2294
2295     label_invalid_code_2:
2296       *dst++ = c1; *dst++= c2;
2297       coding->produced_char += 2;
2298       coding->fake_multibyte = 1;
2299       continue;
2300
2301     label_end_of_loop:
2302       result = CODING_FINISH_INSUFFICIENT_SRC;
2303     label_end_of_loop_2:
2304       src = src_base;
2305       break;
2306     }
2307
2308   if (src < src_end)
2309     {
2310       if (result == CODING_FINISH_NORMAL)
2311         result = CODING_FINISH_INSUFFICIENT_DST;
2312       else if (result != CODING_FINISH_INCONSISTENT_EOL
2313                && coding->mode & CODING_MODE_LAST_BLOCK)
2314         {
2315           src_bytes = src_end - src;
2316           if (dst_bytes && (dst_end - dst < src_bytes))
2317             src_bytes = dst_end - dst;
2318           bcopy (dst, src, src_bytes);
2319           src += src_bytes;
2320           dst += src_bytes;
2321           coding->fake_multibyte = 1;
2322         }
2323     }
2324
2325   coding->consumed = coding->consumed_char = src - source;
2326   coding->produced = dst - destination;
2327   return result;
2328 }
2329
2330 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2331    This function can encode `charset_ascii', `charset_katakana_jisx0201',
2332    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
2333    sure that all these charsets are registered as official charset
2334    (i.e. do not have extended leading-codes).  Characters of other
2335    charsets are produced without any encoding.  If SJIS_P is 1, encode
2336    SJIS text, else encode BIG5 text.  */
2337
2338 int
2339 encode_coding_sjis_big5 (coding, source, destination,
2340                          src_bytes, dst_bytes, sjis_p)
2341      struct coding_system *coding;
2342      unsigned char *source, *destination;
2343      int src_bytes, dst_bytes;
2344      int sjis_p;
2345 {
2346   unsigned char *src = source;
2347   unsigned char *src_end = source + src_bytes;
2348   unsigned char *dst = destination;
2349   unsigned char *dst_end = destination + dst_bytes;
2350   /* Since the maximum bytes produced by each loop is 2, we subtract 1
2351      from DST_END to assure overflow checking is necessary only at the
2352      head of loop.  */
2353   unsigned char *adjusted_dst_end = dst_end - 1;
2354   Lisp_Object unification_table
2355       = coding->character_unification_table_for_encode;
2356   int result = CODING_FINISH_NORMAL;
2357
2358   if (!NILP (Venable_character_unification) && NILP (unification_table))
2359     unification_table = Vstandard_character_unification_table_for_encode;
2360
2361   coding->consumed_char = 0;
2362   coding->fake_multibyte = 0;
2363   while (src < src_end && (dst_bytes
2364                            ? (dst < adjusted_dst_end)
2365                            : (dst < src - 1)))
2366     {
2367       /* SRC_BASE remembers the start position in source in each loop.
2368          The loop will be exited when there's not enough source text
2369          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2370          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2371          before exiting.  */
2372       unsigned char *src_base = src;
2373       unsigned char c1 = *src++, c2, c3, c4;
2374
2375       if (coding->composing)
2376         {
2377           if (c1 == 0xA0)
2378             {
2379               ONE_MORE_BYTE (c1);
2380               c1 &= 0x7F;
2381             }
2382           else if (c1 >= 0xA0)
2383             c1 -= 0x20;
2384           else
2385             coding->composing = 0;
2386         }
2387
2388       switch (emacs_code_class[c1])
2389         {
2390         case EMACS_ascii_code:
2391           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2392           break;
2393
2394         case EMACS_control_code:
2395           *dst++ = c1;
2396           coding->consumed_char++;
2397           break;
2398
2399         case EMACS_carriage_return_code:
2400           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2401             {
2402               *dst++ = c1;
2403               coding->consumed_char++;
2404               break;
2405             }
2406           /* fall down to treat '\r' as '\n' ...  */
2407
2408         case EMACS_linefeed_code:
2409           if (coding->eol_type == CODING_EOL_LF
2410               || coding->eol_type == CODING_EOL_UNDECIDED)
2411             *dst++ = '\n';
2412           else if (coding->eol_type == CODING_EOL_CRLF)
2413             *dst++ = '\r', *dst++ = '\n';
2414           else
2415             *dst++ = '\r';
2416           coding->consumed_char++;
2417           break;
2418
2419         case EMACS_leading_code_2:
2420           ONE_MORE_BYTE (c2);
2421           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2422           break;
2423
2424         case EMACS_leading_code_3:
2425           TWO_MORE_BYTES (c2, c3);
2426           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2427           break;
2428
2429         case EMACS_leading_code_4:
2430           THREE_MORE_BYTES (c2, c3, c4);
2431           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2432           break;
2433
2434         case EMACS_leading_code_composition:
2435           coding->composing = 1;
2436           break;
2437
2438         default:                /* i.e. case EMACS_invalid_code: */
2439           *dst++ = c1;
2440           coding->consumed_char++;
2441         }
2442       continue;
2443
2444     label_end_of_loop:
2445       result = CODING_FINISH_INSUFFICIENT_SRC;
2446       src = src_base;
2447       break;
2448     }
2449
2450   if (result == CODING_FINISH_NORMAL
2451       && src < src_end)
2452     result = CODING_FINISH_INSUFFICIENT_DST;
2453   coding->consumed = src - source;
2454   coding->produced = coding->produced_char = dst - destination;
2455   return result;
2456 }
2457
2458 \f
2459 /*** 5. End-of-line handlers ***/
2460
2461 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2462    This function is called only when `coding->eol_type' is
2463    CODING_EOL_CRLF or CODING_EOL_CR.  */
2464
2465 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2466      struct coding_system *coding;
2467      unsigned char *source, *destination;
2468      int src_bytes, dst_bytes;
2469 {
2470   unsigned char *src = source;
2471   unsigned char *src_end = source + src_bytes;
2472   unsigned char *dst = destination;
2473   unsigned char *dst_end = destination + dst_bytes;
2474   unsigned char c;
2475   int result = CODING_FINISH_NORMAL;
2476
2477   coding->fake_multibyte = 0;
2478
2479   if (src_bytes <= 0)
2480     return result;
2481
2482   switch (coding->eol_type)
2483     {
2484     case CODING_EOL_CRLF:
2485       {
2486         /* Since the maximum bytes produced by each loop is 2, we
2487            subtract 1 from DST_END to assure overflow checking is
2488            necessary only at the head of loop.  */
2489         unsigned char *adjusted_dst_end = dst_end - 1;
2490
2491         while (src < src_end && (dst_bytes
2492                                  ? (dst < adjusted_dst_end)
2493                                  : (dst < src - 1)))
2494           {
2495             unsigned char *src_base = src;
2496
2497             c = *src++;
2498             if (c == '\r')
2499               {
2500                 ONE_MORE_BYTE (c);
2501                 if (c != '\n')
2502                   {
2503                     if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2504                       {
2505                         result = CODING_FINISH_INCONSISTENT_EOL;
2506                         goto label_end_of_loop_2;
2507                       }
2508                     *dst++ = '\r';
2509                     if (BASE_LEADING_CODE_P (c))
2510                       coding->fake_multibyte = 1;
2511                   }
2512                 *dst++ = c;
2513               }
2514             else if (c == '\n'
2515                      && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2516               {
2517                 result = CODING_FINISH_INCONSISTENT_EOL;
2518                 goto label_end_of_loop_2;
2519               }
2520             else
2521               {
2522                 *dst++ = c;
2523                 if (BASE_LEADING_CODE_P (c))
2524                   coding->fake_multibyte = 1;
2525               }
2526             continue;
2527
2528           label_end_of_loop:
2529             result = CODING_FINISH_INSUFFICIENT_SRC;
2530           label_end_of_loop_2:
2531             src = src_base;
2532             break;
2533           }
2534         if (result == CODING_FINISH_NORMAL
2535             && src < src_end)
2536           result = CODING_FINISH_INSUFFICIENT_DST;
2537       }
2538       break;
2539
2540     case CODING_EOL_CR:
2541       if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2542         {
2543           while (src < src_end)
2544             {
2545               if ((c = *src++) == '\n')
2546                 break;
2547               if (BASE_LEADING_CODE_P (c))
2548                 coding->fake_multibyte = 1;
2549             }
2550           if (*--src == '\n')
2551             {
2552               src_bytes = src - source;
2553               result = CODING_FINISH_INCONSISTENT_EOL;
2554             }
2555         }
2556       if (dst_bytes && src_bytes > dst_bytes)
2557         {
2558           result = CODING_FINISH_INSUFFICIENT_DST;
2559           src_bytes = dst_bytes;
2560         }
2561       if (dst_bytes)
2562         bcopy (source, destination, src_bytes);
2563       else
2564         safe_bcopy (source, destination, src_bytes);
2565       src = source + src_bytes;
2566       while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2567       break;
2568
2569     default:                    /* i.e. case: CODING_EOL_LF */
2570       if (dst_bytes && src_bytes > dst_bytes)
2571         {
2572           result = CODING_FINISH_INSUFFICIENT_DST;
2573           src_bytes = dst_bytes;
2574         }
2575       if (dst_bytes)
2576         bcopy (source, destination, src_bytes);
2577       else
2578         safe_bcopy (source, destination, src_bytes);
2579       src += src_bytes;
2580       dst += dst_bytes;
2581       coding->fake_multibyte = 1;
2582       break;
2583     }
2584
2585   coding->consumed = coding->consumed_char = src - source;
2586   coding->produced = coding->produced_char = dst - destination;
2587   return result;
2588 }
2589
2590 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2591    format of end-of-line according to `coding->eol_type'.  If
2592    `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2593    '\r' in source text also means end-of-line.  */
2594
2595 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2596      struct coding_system *coding;
2597      unsigned char *source, *destination;
2598      int src_bytes, dst_bytes;
2599 {
2600   unsigned char *src = source;
2601   unsigned char *dst = destination;
2602   int result = CODING_FINISH_NORMAL;
2603
2604   coding->fake_multibyte = 0;
2605
2606   if (coding->eol_type == CODING_EOL_CRLF)
2607     {
2608       unsigned char c;
2609       unsigned char *src_end = source + src_bytes;
2610       unsigned char *dst_end = destination + dst_bytes;
2611       /* Since the maximum bytes produced by each loop is 2, we
2612          subtract 1 from DST_END to assure overflow checking is
2613          necessary only at the head of loop.  */
2614       unsigned char *adjusted_dst_end = dst_end - 1;
2615
2616       while (src < src_end && (dst_bytes
2617                                ? (dst < adjusted_dst_end)
2618                                : (dst < src - 1)))
2619         {
2620           c = *src++;
2621           if (c == '\n'
2622               || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2623             *dst++ = '\r', *dst++ = '\n';
2624           else
2625             {
2626               *dst++ = c;
2627               if (BASE_LEADING_CODE_P (c))
2628                 coding->fake_multibyte = 1;
2629             }
2630         }
2631       if (src < src_end)
2632         result = CODING_FINISH_INSUFFICIENT_DST;
2633     }
2634   else
2635     {
2636       unsigned char c;
2637
2638       if (dst_bytes && src_bytes > dst_bytes)
2639         {
2640           src_bytes = dst_bytes;
2641           result = CODING_FINISH_INSUFFICIENT_DST;
2642         }
2643       if (dst_bytes)
2644         bcopy (source, destination, src_bytes);
2645       else
2646         {
2647           safe_bcopy (source, destination, src_bytes);
2648           dst_bytes = src_bytes;
2649         }
2650       if (coding->eol_type == CODING_EOL_CRLF)
2651         {
2652           while (src_bytes--)
2653             {
2654               if ((c = *dst++) == '\n')
2655                 dst[-1] = '\r';
2656               else if (BASE_LEADING_CODE_P (c))
2657                   coding->fake_multibyte = 1;
2658             }
2659         }
2660       else
2661         {
2662           if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2663             {
2664               while (src_bytes--)
2665                 if (*dst++ == '\r') dst[-1] = '\n';
2666             }
2667           coding->fake_multibyte = 1;
2668         }
2669       src = source + dst_bytes;
2670       dst = destination + dst_bytes;
2671     }
2672
2673   coding->consumed = coding->consumed_char = src - source;
2674   coding->produced = coding->produced_char = dst - destination;
2675   return result;
2676 }
2677
2678 \f
2679 /*** 6. C library functions ***/
2680
2681 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2682    has a property `coding-system'.  The value of this property is a
2683    vector of length 5 (called as coding-vector).  Among elements of
2684    this vector, the first (element[0]) and the fifth (element[4])
2685    carry important information for decoding/encoding.  Before
2686    decoding/encoding, this information should be set in fields of a
2687    structure of type `coding_system'.
2688
2689    A value of property `coding-system' can be a symbol of another
2690    subsidiary coding-system.  In that case, Emacs gets coding-vector
2691    from that symbol.
2692
2693    `element[0]' contains information to be set in `coding->type'.  The
2694    value and its meaning is as follows:
2695
2696    0 -- coding_type_emacs_mule
2697    1 -- coding_type_sjis
2698    2 -- coding_type_iso2022
2699    3 -- coding_type_big5
2700    4 -- coding_type_ccl encoder/decoder written in CCL
2701    nil -- coding_type_no_conversion
2702    t -- coding_type_undecided (automatic conversion on decoding,
2703                                no-conversion on encoding)
2704
2705    `element[4]' contains information to be set in `coding->flags' and
2706    `coding->spec'.  The meaning varies by `coding->type'.
2707
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
2710    Meanings of these sub-elements are:
2711
2712    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2713         If the value is an integer of valid charset, the charset is
2714         assumed to be designated to graphic register N initially.
2715
2716         If the value is minus, it is a minus value of charset which
2717         reserves graphic register N, which means that the charset is
2718         not designated initially but should be designated to graphic
2719         register N just before encoding a character in that charset.
2720
2721         If the value is nil, graphic register N is never used on
2722         encoding.
2723
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2725         Each value takes t or nil.  See the section ISO2022 of
2726         `coding.h' for more information.
2727
2728    If `coding->type' is `coding_type_big5', element[4] is t to denote
2729    BIG5-ETen or nil to denote BIG5-HKU.
2730
2731    If `coding->type' takes the other value, element[4] is ignored.
2732
2733    Emacs Lisp's coding system also carries information about format of
2734    end-of-line in a value of property `eol-type'.  If the value is
2735    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2736    means CODING_EOL_CR.  If it is not integer, it should be a vector
2737    of subsidiary coding systems of which property `eol-type' has one
2738    of above values.
2739
2740 */
2741
2742 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2743    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2744    is setup so that no conversion is necessary and return -1, else
2745    return 0.  */
2746
2747 int
2748 setup_coding_system (coding_system, coding)
2749      Lisp_Object coding_system;
2750      struct coding_system *coding;
2751 {
2752   Lisp_Object coding_spec, coding_type, eol_type, plist;
2753   Lisp_Object val;
2754   int i;
2755
2756   /* Initialize some fields required for all kinds of coding systems.  */
2757   coding->symbol = coding_system;
2758   coding->common_flags = 0;
2759   coding->mode = 0;
2760   coding->heading_ascii = -1;
2761   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2762   coding_spec = Fget (coding_system, Qcoding_system);
2763   if (!VECTORP (coding_spec)
2764       || XVECTOR (coding_spec)->size != 5
2765       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2766     goto label_invalid_coding_system;
2767
2768   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2769   if (VECTORP (eol_type))
2770     {
2771       coding->eol_type = CODING_EOL_UNDECIDED;
2772       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2773     }
2774   else if (XFASTINT (eol_type) == 1)
2775     {
2776       coding->eol_type = CODING_EOL_CRLF;
2777       coding->common_flags
2778         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2779     }
2780   else if (XFASTINT (eol_type) == 2)
2781     {
2782       coding->eol_type = CODING_EOL_CR;
2783       coding->common_flags
2784         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2785     }
2786   else
2787     coding->eol_type = CODING_EOL_LF;
2788
2789   coding_type = XVECTOR (coding_spec)->contents[0];
2790   /* Try short cut.  */
2791   if (SYMBOLP (coding_type))
2792     {
2793       if (EQ (coding_type, Qt))
2794         {
2795           coding->type = coding_type_undecided;
2796           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2797         }
2798       else
2799         coding->type = coding_type_no_conversion;
2800       return 0;
2801     }
2802
2803   /* Initialize remaining fields.  */
2804   coding->composing = 0;
2805   coding->character_unification_table_for_decode = Qnil;
2806   coding->character_unification_table_for_encode = Qnil;
2807
2808   /* Get values of coding system properties:
2809      `post-read-conversion', `pre-write-conversion',
2810      `character-unification-table-for-decode',
2811      `character-unification-table-for-encode'.  */
2812   plist = XVECTOR (coding_spec)->contents[3];
2813   coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2814   coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2815   val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2816   if (SYMBOLP (val))
2817     val = Fget (val, Qcharacter_unification_table_for_decode);
2818   coding->character_unification_table_for_decode
2819     = CHAR_TABLE_P (val) ? val : Qnil;
2820   val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2821   if (SYMBOLP (val))
2822     val = Fget (val, Qcharacter_unification_table_for_encode);
2823   coding->character_unification_table_for_encode
2824     = CHAR_TABLE_P (val) ? val : Qnil;
2825   val = Fplist_get (plist, Qcoding_category);
2826   if (!NILP (val))
2827     {
2828       val = Fget (val, Qcoding_category_index);
2829       if (INTEGERP (val))
2830         coding->category_idx = XINT (val);
2831       else
2832         goto label_invalid_coding_system;
2833     }
2834   else
2835     goto label_invalid_coding_system;
2836
2837   val = Fplist_get (plist, Qsafe_charsets);
2838   if (EQ (val, Qt))
2839     {
2840       for (i = 0; i <= MAX_CHARSET; i++)
2841         coding->safe_charsets[i] = 1;
2842     }
2843   else
2844     {
2845       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2846       while (CONSP (val))
2847         {
2848           if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2849             coding->safe_charsets[i] = 1;
2850           val = XCONS (val)->cdr;
2851         }
2852     }
2853
2854   switch (XFASTINT (coding_type))
2855     {
2856     case 0:
2857       coding->type = coding_type_emacs_mule;
2858       if (!NILP (coding->post_read_conversion))
2859         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2860       if (!NILP (coding->pre_write_conversion))
2861         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2862       break;
2863
2864     case 1:
2865       coding->type = coding_type_sjis;
2866       coding->common_flags
2867         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2868       break;
2869
2870     case 2:
2871       coding->type = coding_type_iso2022;
2872       coding->common_flags
2873         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2874       {
2875         Lisp_Object val, temp;
2876         Lisp_Object *flags;
2877         int i, charset, reg_bits = 0;
2878
2879         val = XVECTOR (coding_spec)->contents[4];
2880
2881         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2882           goto label_invalid_coding_system;
2883
2884         flags = XVECTOR (val)->contents;
2885         coding->flags
2886           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2887              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2888              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2889              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2890              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2891              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2892              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2893              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2894              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2895              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2896              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2897              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2898              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2899              );
2900
2901         /* Invoke graphic register 0 to plane 0.  */
2902         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2903         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2904         CODING_SPEC_ISO_INVOCATION (coding, 1)
2905           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2906         /* Not single shifting at first.  */
2907         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2908         /* Beginning of buffer should also be regarded as bol. */
2909         CODING_SPEC_ISO_BOL (coding) = 1;
2910
2911         for (charset = 0; charset <= MAX_CHARSET; charset++)
2912           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2913         val = Vcharset_revision_alist;
2914         while (CONSP (val))
2915           {
2916             charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2917             if (charset >= 0
2918                 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2919                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2920               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2921             val = XCONS (val)->cdr;
2922           }
2923
2924         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2925            FLAGS[REG] can be one of below:
2926                 integer CHARSET: CHARSET occupies register I,
2927                 t: designate nothing to REG initially, but can be used
2928                   by any charsets,
2929                 list of integer, nil, or t: designate the first
2930                   element (if integer) to REG initially, the remaining
2931                   elements (if integer) is designated to REG on request,
2932                   if an element is t, REG can be used by any charsets,
2933                 nil: REG is never used.  */
2934         for (charset = 0; charset <= MAX_CHARSET; charset++)
2935           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2936             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2937         for (i = 0; i < 4; i++)
2938           {
2939             if (INTEGERP (flags[i])
2940                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2941                 || (charset = get_charset_id (flags[i])) >= 0)
2942               {
2943                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2944                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2945               }
2946             else if (EQ (flags[i], Qt))
2947               {
2948                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2949                 reg_bits |= 1 << i;
2950                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
2951               }
2952             else if (CONSP (flags[i]))
2953               {
2954                 Lisp_Object tail = flags[i];
2955
2956                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
2957                 if (INTEGERP (XCONS (tail)->car)
2958                     && (charset = XINT (XCONS (tail)->car),
2959                         CHARSET_VALID_P (charset))
2960                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2961                   {
2962                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2963                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2964                   }
2965                 else
2966                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2967                 tail = XCONS (tail)->cdr;
2968                 while (CONSP (tail))
2969                   {
2970                     if (INTEGERP (XCONS (tail)->car)
2971                         && (charset = XINT (XCONS (tail)->car),
2972                             CHARSET_VALID_P (charset))
2973                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2974                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2975                         = i;
2976                     else if (EQ (XCONS (tail)->car, Qt))
2977                       reg_bits |= 1 << i;
2978                     tail = XCONS (tail)->cdr;
2979                   }
2980               }
2981             else
2982               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2983
2984             CODING_SPEC_ISO_DESIGNATION (coding, i)
2985               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2986           }
2987
2988         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2989           {
2990             /* REG 1 can be used only by locking shift in 7-bit env.  */
2991             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2992               reg_bits &= ~2;
2993             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2994               /* Without any shifting, only REG 0 and 1 can be used.  */
2995               reg_bits &= 3;
2996           }
2997
2998         if (reg_bits)
2999           for (charset = 0; charset <= MAX_CHARSET; charset++)
3000             {
3001               if (CHARSET_VALID_P (charset))
3002                 {
3003                   /* There exist some default graphic registers to be
3004                      used CHARSET.  */
3005
3006                   /* We had better avoid designating a charset of
3007                      CHARS96 to REG 0 as far as possible.  */
3008                   if (CHARSET_CHARS (charset) == 96)
3009                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3010                       = (reg_bits & 2
3011                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3012                   else
3013                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3014                       = (reg_bits & 1
3015                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3016                 }
3017             }
3018       }
3019       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3020       coding->spec.iso2022.last_invalid_designation_register = -1;
3021       break;
3022
3023     case 3:
3024       coding->type = coding_type_big5;
3025       coding->common_flags
3026         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3027       coding->flags
3028         = (NILP (XVECTOR (coding_spec)->contents[4])
3029            ? CODING_FLAG_BIG5_HKU
3030            : CODING_FLAG_BIG5_ETEN);
3031       break;
3032
3033     case 4:
3034       coding->type = coding_type_ccl;
3035       coding->common_flags
3036         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3037       {
3038         Lisp_Object val = XVECTOR (coding_spec)->contents[4];
3039         if (CONSP  (val)
3040             && VECTORP (XCONS (val)->car)
3041             && VECTORP (XCONS (val)->cdr))
3042           {
3043             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
3044             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
3045           }
3046         else
3047           goto label_invalid_coding_system;
3048       }
3049       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3050       break;
3051
3052     case 5:
3053       coding->type = coding_type_raw_text;
3054       break;
3055
3056     default:
3057       goto label_invalid_coding_system;
3058     }
3059   return 0;
3060
3061  label_invalid_coding_system:
3062   coding->type = coding_type_no_conversion;
3063   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3064   coding->common_flags = 0;
3065   coding->eol_type = CODING_EOL_LF;
3066   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3067   return -1;
3068 }
3069
3070 /* Emacs has a mechanism to automatically detect a coding system if it
3071    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3072    it's impossible to distinguish some coding systems accurately
3073    because they use the same range of codes.  So, at first, coding
3074    systems are grouped into the following categories:
3075
3076    o coding-category-emacs-mule
3077
3078         The category for a coding system which has the same code range
3079         as Emacs' internal format.  Assigned the coding-system (Lisp
3080         symbol) `emacs-mule' by default.
3081
3082    o coding-category-sjis
3083
3084         The category for a coding system which has the same code range
3085         as SJIS.  Assigned the coding-system (Lisp
3086         symbol) `japanese-shift-jis' by default.
3087
3088    o coding-category-iso-7
3089
3090         The category for a coding system which has the same code range
3091         as ISO2022 of 7-bit environment.  This doesn't use any locking
3092         shift and single shift functions.  This can encode/decode all
3093         charsets.  Assigned the coding-system (Lisp symbol)
3094         `iso-2022-7bit' by default.
3095
3096    o coding-category-iso-7-tight
3097
3098         Same as coding-category-iso-7 except that this can
3099         encode/decode only the specified charsets.
3100
3101    o coding-category-iso-8-1
3102
3103         The category for a coding system which has the same code range
3104         as ISO2022 of 8-bit environment and graphic plane 1 used only
3105         for DIMENSION1 charset.  This doesn't use any locking shift
3106         and single shift functions.  Assigned the coding-system (Lisp
3107         symbol) `iso-latin-1' by default.
3108
3109    o coding-category-iso-8-2
3110
3111         The category for a coding system which has the same code range
3112         as ISO2022 of 8-bit environment and graphic plane 1 used only
3113         for DIMENSION2 charset.  This doesn't use any locking shift
3114         and single shift functions.  Assigned the coding-system (Lisp
3115         symbol) `japanese-iso-8bit' by default.
3116
3117    o coding-category-iso-7-else
3118
3119         The category for a coding system which has the same code range
3120         as ISO2022 of 7-bit environment but uses locking shift or
3121         single shift functions.  Assigned the coding-system (Lisp
3122         symbol) `iso-2022-7bit-lock' by default.
3123
3124    o coding-category-iso-8-else
3125
3126         The category for a coding system which has the same code range
3127         as ISO2022 of 8-bit environment but uses locking shift or
3128         single shift functions.  Assigned the coding-system (Lisp
3129         symbol) `iso-2022-8bit-ss2' by default.
3130
3131    o coding-category-big5
3132
3133         The category for a coding system which has the same code range
3134         as BIG5.  Assigned the coding-system (Lisp symbol)
3135         `cn-big5' by default.
3136
3137    o coding-category-binary
3138
3139         The category for a coding system not categorized in any of the
3140         above.  Assigned the coding-system (Lisp symbol)
3141         `no-conversion' by default.
3142
3143    Each of them is a Lisp symbol whose value is an actual
3144    `coding-system' (which is also a Lisp symbol) assigned by a user.
3145    What Emacs actually does is to detect a category of coding system.
3146    Then, it uses the `coding-system' assigned to it.  If Emacs can't
3147    narrow it down to one category, it selects the category of the
3148    highest priority.  Priorities of categories are also specified by a
3149    user in a Lisp variable `coding-category-list'.
3150
3151 */
3152
3153 /* Detect how the SRC_BYTES bytes of text at SOURCE are encoded and
3154    return a mask of CODING_CATEGORY_MASK_XXX bits (see `coding.h'), or
3155    0 if no byte needing detection is found.  If PRIORITIES is non-NULL,
3156    it holds CODING_CATEGORY_IDX_MAX masks (which may be narrowed here),
3157    and only the first one that matches, else raw-text, is returned.
3158    *SKIP is set to the number of bytes skipped at the head.  */
3159
3160 static int
3161 detect_coding_mask (source, src_bytes, priorities, skip)
3162      unsigned char *source;
3163      int src_bytes, *priorities, *skip;
3164 {
3165   register unsigned char c;
3166   unsigned char *src = source, *src_end = source + src_bytes;
3167   unsigned int mask = (CODING_CATEGORY_MASK_ISO_7BIT
3168                        | CODING_CATEGORY_MASK_ISO_SHIFT);
3169   int i;
3170
3171   /* At first, skip all ASCII characters and control characters except
3172      for three ISO2022 specific control characters.  */
3173  label_loop_detect_coding:
3174   while (src < src_end)
3175     {
3176       c = *src;                 /* SRC keeps pointing at C on break.  */
3177       if (c >= 0x80
3178           || ((mask & CODING_CATEGORY_MASK_ISO_7BIT)
3179               && c == ISO_CODE_ESC)
3180           || ((mask & CODING_CATEGORY_MASK_ISO_SHIFT)
3181               && (c == ISO_CODE_SI || c == ISO_CODE_SO)))
3182         break;
3183       src++;
3184     }
3185   *skip = src - source;
3186
3187   if (src >= src_end)
3188     /* We found nothing other than ASCII.  There's nothing to do.  */
3189     return 0;
3190
3191   /* The text seems to be encoded in some multilingual coding system.
3192      Now, try to find in which coding system the text is encoded.  */
3193   if (c < 0x80)
3194     {
3195       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3196       /* C is an ISO2022 specific control code of C0.  */
3197       mask = detect_coding_iso2022 (src, src_end);
3198       if (mask == 0)
3199         {
3200           /* No valid ISO2022 code follows C.  Skip C and scan again.  */
3201           src++;
3202           mask = (c != ISO_CODE_ESC
3203                   ? CODING_CATEGORY_MASK_ISO_7BIT   /* Keep stopping at ESC.  */
3204                   : CODING_CATEGORY_MASK_ISO_SHIFT); /* Keep stopping at SI/SO.  */
3205           goto label_loop_detect_coding;
3206         }
3207       if (priorities)
3208         goto label_return_highest_only;
3209     }
3210   else
3211     {
3212       int try;                  /* Categories that C does not rule out.  */
3213
3214       if (c < 0xA0)
3215         {
3216           /* C is the first byte of SJIS character code,
3217              or a leading-code of Emacs' internal format (emacs-mule).  */
3218           try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3219
3220           /* Or, if C is a special latin extra code,
3221              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3222              or is a CSI followed by `]', `0]', `1]', or `2]',
3223              we should also consider the possibility of ISO2022 codings.  */
3224           if ((VECTORP (Vlatin_extra_code_table)
3225                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3226               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3227               || (c == ISO_CODE_CSI
3228                   && (src + 1 < src_end /* SRC points at C; look past it.  */
3229                       && (src[1] == ']'
3230                           || ((src[1] == '0' || src[1] == '1' || src[1] == '2')
3231                               && src + 2 < src_end
3232                               && src[2] == ']')))))
3233             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3234                      | CODING_CATEGORY_MASK_ISO_8BIT);
3235         }
3236       else
3237         /* C is a character of ISO2022 in graphic plane right,
3238            or a SJIS's 1-byte character code (i.e. JISX0201),
3239            or the first byte of BIG5's 2-byte code.  */
3240         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3241                 | CODING_CATEGORY_MASK_ISO_8BIT
3242                 | CODING_CATEGORY_MASK_SJIS
3243                 | CODING_CATEGORY_MASK_BIG5);
3244
3245       mask = 0;
3246       if (priorities)
3247         {
3248           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3249             {
3250               priorities[i] &= try; /* Modifies the caller's array.  */
3251               if (priorities[i] & CODING_CATEGORY_MASK_ISO)
3252                 mask = detect_coding_iso2022 (src, src_end);
3253               else if (priorities[i] & CODING_CATEGORY_MASK_SJIS)
3254                 mask = detect_coding_sjis (src, src_end);
3255               else if (priorities[i] & CODING_CATEGORY_MASK_BIG5)
3256                 mask = detect_coding_big5 (src, src_end);
3257               else if (priorities[i] & CODING_CATEGORY_MASK_EMACS_MULE)
3258                 mask = detect_coding_emacs_mule (src, src_end);
3259               if (mask)         /* Stop at the first detector that matches.  */
3260                 goto label_return_highest_only;
3261             }
3262           return CODING_CATEGORY_MASK_RAW_TEXT;
3263         }
3264       if (try & CODING_CATEGORY_MASK_ISO) /* No PRIORITIES: collect all.  */
3265         mask |= detect_coding_iso2022 (src, src_end);
3266       if (try & CODING_CATEGORY_MASK_SJIS)
3267         mask |= detect_coding_sjis (src, src_end);
3268       if (try & CODING_CATEGORY_MASK_BIG5)
3269         mask |= detect_coding_big5 (src, src_end);
3270       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3271         mask |= detect_coding_emacs_mule (src, src_end);
3272     }
3273   return (mask | CODING_CATEGORY_MASK_RAW_TEXT);
3274
3275  label_return_highest_only:   /* Reached only with PRIORITIES non-NULL.  */
3276   for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3277     {
3278       if (mask & priorities[i])
3279         return priorities[i];
3280     }
3281   return CODING_CATEGORY_MASK_RAW_TEXT;
3282 }
3283
3284 /* Detect how the SRC_BYTES bytes of text at SRC are encoded and set up
3285    CODING for the result.  CODING->heading_ascii is always updated.  */
3286
3287 void
3288 detect_coding (coding, src, src_bytes)
3289      struct coding_system *coding;
3290      unsigned char *src;
3291      int src_bytes;
3292 {
3293   unsigned int idx;
3294   int skip, mask, i;
3295   int priorities[CODING_CATEGORY_IDX_MAX];
3296   Lisp_Object val = Vcoding_category_list;
3297
3298   i = 0;                /* Fill PRIORITIES from Vcoding_category_list.  */
3299   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
3300     {
3301       if (! SYMBOLP (XCONS (val)->car))
3302         break;
3303       idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
3304       if (idx >= CODING_CATEGORY_IDX_MAX)
3305         break;
3306       priorities[i++] = (1 << idx);     /* Store the category as a mask.  */
3307       val = XCONS (val)->cdr;
3308     }
3309   /* If coding-category-list is valid and contains all coding
3310      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
3311      the following code saves Emacs from crashing.  */
3312   while (i < CODING_CATEGORY_IDX_MAX)
3313     priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
3314
3315   mask = detect_coding_mask (src, src_bytes, priorities, &skip);
3316   coding->heading_ascii = skip;
3317
3318   if (!mask) return;            /* Nothing detected; leave CODING as is.  */
3319
3320   /* Turn MASK into a category index: the position of its lowest set bit.  */
3321   idx = 0;
3322   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3323   if (! mask)
3324     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3325
3326   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3327
3328   if (coding->eol_type != CODING_EOL_UNDECIDED)
3329     {
3330       Lisp_Object tmp = Fget (val, Qeol_type);
3331
3332       if (VECTORP (tmp))        /* Use the entry for the known eol type.  */
3333         val = XVECTOR (tmp)->contents[coding->eol_type];
3334     }
3335   setup_coding_system (val, coding);
3336   /* Set this again because setup_coding_system reset this member.  */
3337   coding->heading_ascii = skip;
3338 }
3339
3340 /* Detect how end-of-line of the SRC_BYTES bytes of text at SOURCE is
3341    encoded, looking at no more than MAX_EOL_CHECK_COUNT end-of-lines.
3342    Return CODING_EOL_LF, CODING_EOL_CRLF, or CODING_EOL_CR, or else
3343    CODING_EOL_UNDECIDED (no eol found) or CODING_EOL_INCONSISTENT.
3344    The offset of the first eol (SRC_BYTES if none) is stored in *SKIP.  */
3345
3346 #define MAX_EOL_CHECK_COUNT 3
3347
3348 static int
3349 detect_eol_type (source, src_bytes, skip)
3350      unsigned char *source;
3351      int src_bytes, *skip;
3352 {
3353   unsigned char *src = source, *src_end = src + src_bytes;
3354   unsigned char c;
3355   int total = 0;                /* How many end-of-lines are found so far.  */
3356   int eol_type = CODING_EOL_UNDECIDED;
3357   int this_eol_type;
3358   /* -1 means no eol seen yet; 0 is a valid offset, so not a sentinel.  */
3359   *skip = -1;
3360
3361   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3362     {
3363       c = *src++;
3364       if (c == '\n' || c == '\r')
3365         {
3366           if (*skip < 0)
3367             *skip = src - 1 - source;   /* SRC is already past C.  */
3368           total++;
3369           if (c == '\n')
3370             this_eol_type = CODING_EOL_LF;
3371           else if (src >= src_end || *src != '\n')
3372             this_eol_type = CODING_EOL_CR;
3373           else
3374             this_eol_type = CODING_EOL_CRLF, src++;
3375
3376           if (eol_type == CODING_EOL_UNDECIDED)
3377             /* This is the first end-of-line.  */
3378             eol_type = this_eol_type;
3379           else if (eol_type != this_eol_type)
3380             {
3381               /* The found type is different from what found before.  */
3382               eol_type = CODING_EOL_INCONSISTENT;
3383               break;
3384             }
3385         }
3386     }
3387
3388   if (*skip < 0)
3389     *skip = src_end - source;
3390   return eol_type;
3391 }
3392
3393 /* Detect how end-of-line of the SRC_BYTES bytes of text at SRC is
3394    encoded.  If a format is detected (an inconsistent one counts as
3395    LF), set up *CODING for it.  CODING->heading_ascii may be lowered.  */
3396
3397 void
3398 detect_eol (coding, src, src_bytes)
3399      struct coding_system *coding;
3400      unsigned char *src;
3401      int src_bytes;
3402 {
3403   Lisp_Object val;
3404   int skip;
3405   int eol_type = detect_eol_type (src, src_bytes, &skip);
3406
3407   if (coding->heading_ascii > skip)     /* Both become the smaller value.  */
3408     coding->heading_ascii = skip;
3409   else
3410     skip = coding->heading_ascii;
3411
3412   if (eol_type == CODING_EOL_UNDECIDED) /* No end-of-line was found.  */
3413     return;
3414   if (eol_type == CODING_EOL_INCONSISTENT)
3415     {
3416 #if 0
3417       /* This code is suppressed until we find a better way to
3418          distinguish raw text file and binary file.  */
3419
3420       /* If we have already detected that the coding is raw-text, the
3421          coding should actually be no-conversion.  */
3422       if (coding->type == coding_type_raw_text)
3423         {
3424           setup_coding_system (Qno_conversion, coding);
3425           return;
3426         }
3427       /* Else, let's decode only text code anyway.  */
3428 #endif /* 0 */
3429       eol_type = CODING_EOL_LF;
3430     }
3431
3432   val = Fget (coding->symbol, Qeol_type);
3433   if (VECTORP (val) && XVECTOR (val)->size == 3) /* Indexed by eol type.  */
3434     {
3435       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3436       coding->heading_ascii = skip;     /* Set again after the setup call.  */
3437     }
3438 }
3439
3440 #define CONVERSION_BUFFER_EXTRA_ROOM 256 /* Slack added to size estimates.  */
3441 /* Factor by which decoding_buffer_size multiplies the source size.  */
3442 #define DECODING_BUFFER_MAG(coding)                                              \
3443   ((coding)->type == coding_type_iso2022                                         \
3444    ? 3                                                                           \
3445    : (((coding)->type == coding_type_sjis || (coding)->type == coding_type_big5) \
3446       ? 2                                                                        \
3447       : ((coding)->type == coding_type_raw_text                                  \
3448          ? 1                                                                     \
3449          : ((coding)->type == coding_type_ccl                                    \
3450             ? (coding)->spec.ccl.decoder.buf_magnification                       \
3451             : 2))))
3452
3453 /* Return maximum size (bytes) of a buffer enough for decoding
3454    SRC_BYTES of text encoded in CODING.  */
3455
3456 int
3457 decoding_buffer_size (coding, src_bytes)
3458      struct coding_system *coding;
3459      int src_bytes;
3460 {
3461   return (src_bytes * DECODING_BUFFER_MAG (coding) /* Factor depends on type.  */
3462           + CONVERSION_BUFFER_EXTRA_ROOM); /* NOTE(review): no overflow check.  */
3463 }
3464
3465 /* Return maximum size (bytes) of a buffer enough for encoding
3466    SRC_BYTES of text to CODING.  */
3467
3468 int
3469 encoding_buffer_size (coding, src_bytes)
3470      struct coding_system *coding;
3471      int src_bytes;
3472 {
3473   int magnification;            /* Output bytes allowed per source byte.  */
3474
3475   if (coding->type == coding_type_ccl)
3476     magnification = coding->spec.ccl.encoder.buf_magnification;
3477   else
3478     magnification = 3;          /* Fixed factor for every non-CCL type.  */
3479
3480   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3481 }
3482
3483 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3484 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3485 #endif
3486
3487 char *conversion_buffer;        /* Storage returned by get_conversion_buffer.  */
3488 int conversion_buffer_size;     /* Size recorded for that storage.  */
3489
3490 /* Return a pointer to a shared buffer of at least SIZE bytes to be used
3491    for encoding or decoding.  When too small it is reallocated (contents
3492    are lost), doubling the size or starting at the minimum if it was 0.  */
3493
3494 char *
3495 get_conversion_buffer (size)
3496      int size;
3497 {
3498   if (size > conversion_buffer_size)
3499     {
3500       char *buf;
3501       int real_size = conversion_buffer_size * 2;
3502       if (real_size <= 0) real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3503       while (real_size < size) real_size *= 2;
3504       buf = (char *) xmalloc (real_size);  /* NOTE(review): result unchecked.  */
3505       xfree (conversion_buffer);
3506       conversion_buffer = buf;
3507       conversion_buffer_size = real_size;
3508     }
3509   return conversion_buffer;
3510 }
3511 /* Run CODING's CCL encoder (ENCODEP nonzero) or decoder on SOURCE.  */
3512 int
3513 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3514      struct coding_system *coding;
3515      unsigned char *source, *destination;
3516      int src_bytes, dst_bytes, encodep;
3517 {
3518   struct ccl_program *ccl
3519     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3520   int result;                   /* The CODING_FINISH_XXX code to return.  */
3521
3522   coding->produced = ccl_driver (ccl, source, destination,
3523                                  src_bytes, dst_bytes, &(coding->consumed));
3524   if (encodep)
3525     {
3526       coding->produced_char = coding->produced; /* Same as the byte count.  */
3527       coding->consumed_char
3528         = multibyte_chars_in_text (source, coding->consumed);
3529     }
3530   else
3531     {
3532       coding->produced_char
3533         = multibyte_chars_in_text (destination, coding->produced);
3534       coding->consumed_char = coding->consumed; /* Same as the byte count.  */
3535     }
3536   switch (ccl->status)          /* Map the CCL status to a finish code.  */
3537     {
3538     case CCL_STAT_SUSPEND_BY_SRC:
3539       result = CODING_FINISH_INSUFFICIENT_SRC;
3540       break;
3541     case CCL_STAT_SUSPEND_BY_DST:
3542       result = CODING_FINISH_INSUFFICIENT_DST;
3543       break;
3544     default:                    /* Any other status counts as normal.  */
3545       result = CODING_FINISH_NORMAL;
3546       break;
3547     }
3548   return result;
3549 }
3550
3551 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
3552    decoding, it may detect coding system and format of end-of-line if
3553    those are not yet decided.  Return a CODING_FINISH_XXX code.  */
3554
3555 int
3556 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3557      struct coding_system *coding;
3558      unsigned char *source, *destination;
3559      int src_bytes, dst_bytes;
3560 {
3561   int result;
3562
3563   if (src_bytes <= 0)           /* Nothing to decode: report zero counts.  */
3564     {
3565       coding->produced = coding->produced_char = 0;
3566       coding->consumed = coding->consumed_char = 0;
3567       coding->fake_multibyte = 0;
3568       return CODING_FINISH_NORMAL;
3569     }
3570
3571   if (coding->type == coding_type_undecided)
3572     detect_coding (coding, source, src_bytes);
3573
3574   if (coding->eol_type == CODING_EOL_UNDECIDED)
3575     detect_eol (coding, source, src_bytes);
3576
3577   switch (coding->type)         /* Dispatch on the (possibly detected) type.  */
3578     {
3579     case coding_type_emacs_mule:
3580     case coding_type_undecided:
3581     case coding_type_raw_text:
3582       if (coding->eol_type == CODING_EOL_LF
3583           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3584         goto label_no_conversion;       /* Plain copy is enough.  */
3585       result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3586       break;
3587
3588     case coding_type_sjis:
3589       result = decode_coding_sjis_big5 (coding, source, destination,
3590                                         src_bytes, dst_bytes, 1);
3591       break;
3592
3593     case coding_type_iso2022:
3594       result = decode_coding_iso2022 (coding, source, destination,
3595                                       src_bytes, dst_bytes);
3596       break;
3597
3598     case coding_type_big5:
3599       result = decode_coding_sjis_big5 (coding, source, destination,
3600                                         src_bytes, dst_bytes, 0);
3601       break;
3602
3603     case coding_type_ccl:
3604       result = ccl_coding_driver (coding, source, destination,
3605                                   src_bytes, dst_bytes, 0);
3606       break;
3607
3608     default:                    /* i.e. case coding_type_no_conversion: */
3609     label_no_conversion:
3610       if (dst_bytes && src_bytes > dst_bytes)   /* Copy only what fits.  */
3611         {
3612           coding->produced = dst_bytes;
3613           result = CODING_FINISH_INSUFFICIENT_DST;
3614         }
3615       else
3616         {
3617           coding->produced = src_bytes;
3618           result = CODING_FINISH_NORMAL;
3619         }
3620       if (dst_bytes)    /* NOTE(review): 0 presumably means overlap; confirm.  */
3621         bcopy (source, destination, coding->produced);
3622       else
3623         safe_bcopy (source, destination, coding->produced);
3624       coding->fake_multibyte = 1;
3625       coding->consumed          /* All four counts equal the bytes copied.  */
3626         = coding->consumed_char = coding->produced_char = coding->produced;
3627       break;
3628     }
3629
3630   return result;
3631 }
3632
3633 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
3634
3635 int
3636 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3637      struct coding_system *coding;
3638      unsigned char *source, *destination;
3639      int src_bytes, dst_bytes;
3640 {
3641   int result;                   /* The CODING_FINISH_XXX code to return.  */
3642
3643   if (src_bytes <= 0)           /* Nothing to encode: report zero counts.  */
3644     {
3645       coding->produced = coding->produced_char = 0;
3646       coding->consumed = coding->consumed_char = 0;
3647       coding->fake_multibyte = 0;
3648       return CODING_FINISH_NORMAL;
3649     }
3650
3651   switch (coding->type)         /* No detection is done when encoding.  */
3652     {
3653     case coding_type_emacs_mule:
3654     case coding_type_undecided:
3655     case coding_type_raw_text:
3656       if (coding->eol_type == CODING_EOL_LF
3657           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3658         goto label_no_conversion;       /* Plain copy is enough.  */
3659       result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3660       break;
3661
3662     case coding_type_sjis:
3663       result = encode_coding_sjis_big5 (coding, source, destination,
3664                                         src_bytes, dst_bytes, 1);
3665       break;
3666
3667     case coding_type_iso2022:
3668       result = encode_coding_iso2022 (coding, source, destination,
3669                                       src_bytes, dst_bytes);
3670       break;
3671
3672     case coding_type_big5:
3673       result = encode_coding_sjis_big5 (coding, source, destination,
3674                                         src_bytes, dst_bytes, 0);
3675       break;
3676
3677     case coding_type_ccl:
3678       result = ccl_coding_driver (coding, source, destination,
3679                                   src_bytes, dst_bytes, 1);
3680       break;
3681
3682     default:                    /* i.e. case coding_type_no_conversion: */
3683     label_no_conversion:
3684       if (dst_bytes && src_bytes > dst_bytes)   /* Copy only what fits.  */
3685         {
3686           coding->produced = dst_bytes;
3687           result = CODING_FINISH_INSUFFICIENT_DST;
3688         }
3689       else
3690         {
3691           coding->produced = src_bytes;
3692           result = CODING_FINISH_NORMAL;
3693         }
3694       if (dst_bytes)    /* NOTE(review): 0 presumably means overlap; confirm.  */
3695         bcopy (source, destination, coding->produced);
3696       else
3697         safe_bcopy (source, destination, coding->produced);
3698       if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3699         {
3700           unsigned char *p = destination, *pend = p + coding->produced;
3701           while (p < pend)      /* Turn each copied CR into a newline.  */
3702             if (*p++ == '\015') p[-1] = '\n';
3703         }
3704       coding->fake_multibyte = 1;
3705       coding->consumed          /* All four counts equal the bytes copied.  */
3706         = coding->consumed_char = coding->produced_char = coding->produced;
3707       break;
3708     }
3709
3710   return result;
3711 }
3712
3713 /* Scan text in the region between *BEG and *END (byte positions),
3714    skip characters which we don't have to decode by coding system
3715    CODING at the head and tail, then set *BEG and *END to the region
3716    of the text we actually have to convert.  The caller should move
3717    the gap out of the region in advance.
3718
3719    If STR is not NULL, *BEG and *END are indices into STR.  */
3720
3721 static void
3722 shrink_decoding_region (beg, end, coding, str)
3723      int *beg, *end;
3724      struct coding_system *coding;
3725      unsigned char *str;
3726 {
3727   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3728   int eol_conversion;
3729
3730   if (coding->type == coding_type_ccl
3731       || coding->type == coding_type_undecided
3732       || !NILP (coding->post_read_conversion))
3733     {
3734       /* We can't skip any data.  */
3735       return;
3736     }
3737   else if (coding->type == coding_type_no_conversion)
3738     {
3739       /* We need no conversion, but don't have to skip any data here.
3740          Decoding routine handles them effectively anyway.  */
3741       return;
3742     }
3743
3744   if (coding->heading_ascii >= 0)
3745     /* Detection routine has already found how much we can skip at the
3746        head.  */
3747     *beg += coding->heading_ascii;
3748
3749   if (str)
3750     {
3751       begp_orig = begp = str + *beg;
3752       endp_orig = endp = str + *end;
3753     }
3754   else
3755     {
3756       begp_orig = begp = BYTE_POS_ADDR (*beg);
3757       endp_orig = endp = begp + *end - *beg;
3758     }
3759
3760   eol_conversion = (coding->eol_type != CODING_EOL_LF);
3761
3762   switch (coding->type)
3763     {
3764     case coding_type_emacs_mule:
3765     case coding_type_raw_text:
3766       if (eol_conversion)
3767         {
3768           if (coding->heading_ascii < 0)
3769             while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3770           while (begp < endp && *(endp - 1) != '\r' && *(endp - 1) < 0x80)
3771             endp--;
3772         }
3773       else
3774         begp = endp;
3775       break;
3776
3777     case coding_type_sjis:
3778     case coding_type_big5:
3779       /* We can skip all ASCII characters at the head.  */
3780       if (coding->heading_ascii < 0)
3781         {
3782           if (eol_conversion)
3783             while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
3784           else
3785             while (begp < endp && *begp < 0x80) begp++;
3786         }
3787       /* We can skip all ASCII characters at the tail except for the
3788          second byte of SJIS or BIG5 code.  */
3789       if (eol_conversion)
3790         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
3791       else
3792         while (begp < endp && endp[-1] < 0x80) endp--;
3793       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3794         endp++;
3795       break;
3796
3797     default:            /* i.e. case coding_type_iso2022: */
3798       if (coding->heading_ascii < 0)
3799         {
3800           /* We can skip all ASCII characters at the head except for a
3801              few control codes.  */
3802           while (begp < endp && (c = *begp) < 0x80
3803                  && c != ISO_CODE_CR && c != ISO_CODE_SO
3804                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
3805                  && (!eol_conversion || c != ISO_CODE_LF))
3806             begp++;
3807         }
3808       switch (coding->category_idx)
3809         {
3810         case CODING_CATEGORY_IDX_ISO_8_1:
3811         case CODING_CATEGORY_IDX_ISO_8_2:
3812           /* We can skip all ASCII characters at the tail.  */
3813           if (eol_conversion)
3814             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\n') endp--;
3815           else
3816             while (begp < endp && endp[-1] < 0x80) endp--;
3817           break;
3818
3819         case CODING_CATEGORY_IDX_ISO_7:
3820         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
3821           /* We can skip all charactes at the tail except for ESC and
3822              the following 2-byte at the tail.  */
3823           if (eol_conversion)
3824             while (begp < endp
3825                    && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC && c != '\n')
3826               endp--;
3827           else
3828             while (begp < endp
3829                    && (c = endp[-1]) < 0x80 && c != ISO_CODE_ESC)
3830               endp--;
3831           if (begp < endp && endp[-1] == ISO_CODE_ESC)
3832             {
3833               if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
3834                 /* This is an ASCII designation sequence.  We can
3835                     surely skip the tail.  */
3836                 endp += 2;
3837               else
3838                 /* Hmmm, we can't skip the tail.  */
3839                 endp = endp_orig;
3840             }
3841         }
3842     }
3843   *beg += begp - begp_orig;
3844   *end += endp - endp_orig;
3845   return;
3846 }
3847
3848 /* Like shrink_decoding_region but for encoding.  */
3849
3850 static void
3851 shrink_encoding_region (beg, end, coding, str)
3852      int *beg, *end;
3853      struct coding_system *coding;
3854      unsigned char *str;
3855 {
3856   unsigned char *begp_orig, *begp, *endp_orig, *endp;
3857   int eol_conversion;
3858
3859   if (coding->type == coding_type_ccl)
3860     /* We can't skip any data.  */
3861     return;
3862   else if (coding->type == coding_type_no_conversion)
3863     {
3864       /* We need no conversion.  */
3865       *beg = *end;
3866       return;
3867     }
3868
3869   if (str)
3870     {
3871       begp_orig = begp = str + *beg;
3872       endp_orig = endp = str + *end;
3873     }
3874   else
3875     {
3876       begp_orig = begp = BYTE_POS_ADDR (*beg);
3877       endp_orig = endp = begp + *end - *beg;
3878     }
3879
3880   eol_conversion = (coding->eol_type == CODING_EOL_CR
3881                     || coding->eol_type == CODING_EOL_CRLF);
3882
3883   /* Here, we don't have to check coding->pre_write_conversion because
3884      the caller is expected to have handled it already.  */
3885   switch (coding->type)
3886     {
3887     case coding_type_undecided:
3888     case coding_type_emacs_mule:
3889     case coding_type_raw_text:
3890       if (eol_conversion)
3891         {
3892           while (begp < endp && *begp != '\n') begp++;
3893           while (begp < endp && endp[-1] != '\n') endp--;
3894         }
3895       else
3896         begp = endp;
3897       break;
3898
3899     case coding_type_iso2022:
3900       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3901         {
3902           unsigned char *bol = begp;
3903           while (begp < endp && *begp < 0x80)
3904             {
3905               begp++;
3906               if (begp[-1] == '\n')
3907                 bol = begp;
3908             }
3909           begp = bol;
3910           goto label_skip_tail;
3911         }
3912       /* fall down ... */
3913
3914     default:
3915       /* We can skip all ASCII characters at the head and tail.  */
3916       if (eol_conversion)
3917         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
3918       else
3919         while (begp < endp && *begp < 0x80) begp++;
3920     label_skip_tail:
3921       if (eol_conversion)
3922         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
3923       else
3924         while (begp < endp && *(endp - 1) < 0x80) endp--;
3925       break;
3926     }
3927
3928   *beg += begp - begp_orig;
3929   *end += endp - endp_orig;
3930   return;
3931 }
3932
3933 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
3934    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
3935    coding system CODING, and return the status code of code conversion
3936    (currently, this value has no meaning).
3937
3938    How many characters (and bytes) are converted to how many
3939    characters (and bytes) are recorded in members of the structure
3940    CODING.
3941
3942    If ADJUST is nonzero, we do various things as if the original text
3943    is deleted and a new text is inserted.  See the comments in
3944    replace_range (insdel.c) to know what we are doing.
3945
3946    ADJUST nonzero also means that post-read-conversion or
3947    pre-write-conversion functions (if any) should be processed.  */
3948
3949 int
3950 code_convert_region (from, from_byte, to, to_byte, coding, encodep, adjust)
3951      int from, from_byte, to, to_byte, encodep, adjust;
3952      struct coding_system *coding;
3953 {
3954   int len = to - from, len_byte = to_byte - from_byte;
3955   int require, inserted, inserted_byte;
3956   int from_byte_orig, to_byte_orig;
3957   Lisp_Object saved_coding_symbol = Qnil;
3958   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
3959   int first = 1;
3960   int fake_multibyte = 0;
3961   unsigned char *src, *dst;
3962
3963   if (adjust)
3964     {
3965       int saved_from = from;
3966
3967       prepare_to_modify_buffer (from, to, &from);
3968       if (saved_from != from)
3969         {
3970           to = from + len;
3971           if (multibyte)
3972             from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
3973           else
3974             from_byte = from, to_byte = to;
3975           len_byte = to_byte - from_byte;
3976         }
3977     }
3978
3979   if (! encodep && CODING_REQUIRE_DETECTION (coding))
3980     {
3981       /* We must detect encoding of text and eol.  Even if detection
3982          routines can't decide the encoding, we should not let them
3983          undecided because the deeper decoding routine (decode_coding)
3984          tries to detect the encodings in vain in that case.  */
3985
3986       if (from < GPT && to > GPT)
3987         move_gap_both (from, from_byte);
3988       if (coding->type == coding_type_undecided)
3989         {
3990           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
3991           if (coding->type == coding_type_undecided)
3992             coding->type = coding_type_emacs_mule;
3993         }
3994       if (coding->eol_type == CODING_EOL_UNDECIDED)
3995         {
3996           saved_coding_symbol = coding->symbol;
3997           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
3998           if (coding->eol_type == CODING_EOL_UNDECIDED)
3999             coding->eol_type = CODING_EOL_LF;
4000           /* We had better recover the original eol format if we
4001              encounter an inconsitent eol format while decoding.  */
4002           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4003         }
4004     }
4005
4006   coding->consumed_char = len, coding->consumed = len_byte;
4007
4008   if (encodep
4009       ? ! CODING_REQUIRE_ENCODING (coding)
4010       : ! CODING_REQUIRE_DECODING (coding))
4011     {
4012       coding->produced = len_byte;
4013       if (multibyte)
4014         {
4015           if (GPT < from || GPT > to)
4016             move_gap_both (from, from_byte);
4017           coding->produced_char
4018             = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4019           if (coding->produced_char != len)
4020             {
4021               int diff = coding->produced_char - len;
4022
4023               if (adjust)
4024                 adjust_before_replace (from, from_byte, to, to_byte);
4025               ZV += diff; Z += diff; GPT += diff;
4026               if (adjust)
4027                 adjust_after_replace (from, from_byte, to, to_byte,
4028                                       diff, 0);
4029             }
4030         }
4031       else
4032         coding->produced_char = len_byte;
4033       return 0;
4034     }
4035
4036   /* Now we convert the text.  */
4037
4038   /* For encoding, we must process pre-write-conversion in advance.  */
4039   if (encodep
4040       && adjust
4041       && ! NILP (coding->pre_write_conversion)
4042       && SYMBOLP (coding->pre_write_conversion)
4043       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4044     {
4045       /* The function in pre-write-conversion put a new text in a new
4046          buffer.  */
4047       struct buffer *prev = current_buffer, *new;
4048
4049       call2 (coding->pre_write_conversion, from, to);
4050       if (current_buffer != prev)
4051         {
4052           len = ZV - BEGV;
4053           new = current_buffer;
4054           set_buffer_internal_1 (prev);
4055           del_range_2 (from, to, from_byte, to_byte);
4056           insert_from_buffer (new, BEG, len, 0);
4057           to = from + len;
4058           to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4059           len_byte = to_byte - from_byte;
4060         }
4061     }
4062
4063   /* Try to skip the heading and tailing ASCIIs.  */
4064   from_byte_orig = from_byte; to_byte_orig = to_byte;
4065   if (from < GPT && GPT < to)
4066     move_gap (from);
4067   if (encodep)
4068     shrink_encoding_region (&from_byte, &to_byte, coding, NULL);
4069   else
4070     shrink_decoding_region (&from_byte, &to_byte, coding, NULL);
4071   if (from_byte == to_byte)
4072     {
4073       coding->produced = len_byte;
4074       coding->produced_char = multibyte ? len : len_byte;
4075       return 0;
4076     }
4077
4078   /* Here, the excluded region by shrinking contains only ASCIIs.  */
4079   from += (from_byte - from_byte_orig);
4080   to += (to_byte - to_byte_orig);
4081   len = to - from;
4082   len_byte = to_byte - from_byte;
4083
4084   /* For converion, we must put the gap before the text in addition to
4085      making the gap larger for efficient decoding.  The required gap
4086      size starts from 2000 which is the magic number used in make_gap.
4087      But, after one batch of conversion, it will be incremented if we
4088      find that it is not enough .  */
4089   require = 2000;
4090
4091   if (GAP_SIZE  < require)
4092     make_gap (require - GAP_SIZE);
4093   move_gap_both (from, from_byte);
4094
4095   if (adjust)
4096     adjust_before_replace (from, from_byte, to, to_byte);
4097
4098   if (GPT - BEG < beg_unchanged)
4099     beg_unchanged = GPT - BEG;
4100   if (Z - GPT < end_unchanged)
4101     end_unchanged = Z - GPT;
4102
4103   inserted = inserted_byte = 0;
4104   src = GAP_END_ADDR, dst = GPT_ADDR;
4105
4106   GAP_SIZE += len_byte;
4107   ZV -= len;
4108   Z -= len;
4109   ZV_BYTE -= len_byte;
4110   Z_BYTE -= len_byte;
4111
4112   for (;;)
4113     {
4114       int result;
4115
4116       /* The buffer memory is changed from:
4117          +--------+converted-text+---------+-------original-text------+---+
4118          |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4119                   |<------------------- GAP_SIZE -------------------->|  */
4120       if (encodep)
4121         result = encode_coding (coding, src, dst, len_byte, 0);
4122       else
4123         result = decode_coding (coding, src, dst, len_byte, 0);
4124       /* to:
4125          +--------+-------converted-text--------+--+---original-text--+---+
4126          |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4127                   |<------------------- GAP_SIZE -------------------->|  */
4128       if (coding->fake_multibyte)
4129         fake_multibyte = 1;
4130
4131       if (!encodep && !multibyte)
4132         coding->produced_char = coding->produced;
4133       inserted += coding->produced_char;
4134       inserted_byte += coding->produced;
4135       len_byte -= coding->consumed;
4136       src += coding->consumed;
4137       dst += inserted_byte;
4138
4139       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4140         {
4141           unsigned char *pend = dst, *p = pend - inserted_byte;
4142
4143           /* Encode LFs back to the original eol format (CR or CRLF).  */
4144           if (coding->eol_type == CODING_EOL_CR)
4145             {
4146               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4147             }
4148           else
4149             {
4150               int count = 0;
4151
4152               while (p < pend) if (*p++ == '\n') count++;
4153               if (src - dst < count)
4154                 {
4155                   /* We don't have sufficient room for putting LFs
4156                      back to CRLF.  We must record converted and
4157                      not-yet-converted text back to the buffer
4158                      content, enlarge the gap, then record them out of
4159                      the buffer contents again.  */
4160                   int add = len_byte + inserted_byte;
4161
4162                   GAP_SIZE -= add;
4163                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4164                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4165                   make_gap (count - GAP_SIZE);
4166                   GAP_SIZE += add;
4167                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4168                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4169                   /* Don't forget to update SRC, DST, and PEND.  */
4170                   src = GAP_END_ADDR - len_byte;
4171                   dst = GPT_ADDR + inserted_byte;
4172                   pend = dst;
4173                 }
4174               inserted += count;
4175               inserted_byte += count;
4176               coding->produced += count;
4177               p = dst = pend + count;
4178               while (count)
4179                 {
4180                   *--p = *--pend;
4181                   if (*p == '\n') count--, *--p = '\r';
4182                 }
4183             }
4184
4185           /* Suppress eol-format conversion in the further conversion.  */
4186           coding->eol_type = CODING_EOL_LF;
4187
4188           /* Restore the original symbol.  */
4189           coding->symbol = saved_coding_symbol;
4190
4191           continue;
4192         }
4193       if (len_byte <= 0)
4194         break;
4195       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4196         {
4197           /* The source text ends in invalid codes.  Let's just
4198              make them valid buffer contents, and finish conversion.  */
4199           inserted += len_byte;
4200           inserted_byte += len_byte;
4201           while (len_byte--)
4202             *src++ = *dst++;
4203           fake_multibyte = 1;
4204           break;
4205         }
4206       if (first)
4207         {
4208           /* We have just done the first batch of conversion which was
4209              stoped because of insufficient gap.  Let's reconsider the
4210              required gap size (i.e. SRT - DST) now.
4211
4212              We have converted ORIG bytes (== coding->consumed) into
4213              NEW bytes (coding->produced).  To convert the remaining
4214              LEN bytes, we may need REQUIRE bytes of gap, where:
4215                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4216                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4217              Here, we are sure that NEW >= ORIG.  */
4218           require = (len_byte * (coding->produced - coding->consumed)
4219                      / coding->consumed);
4220           first = 0;
4221         }
4222       if ((src - dst) < (require + 2000))
4223         {
4224           /* See the comment above the previous call of make_gap.  */
4225           int add = len_byte + inserted_byte;
4226
4227           GAP_SIZE -= add;
4228           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4229           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4230           make_gap (require + 2000);
4231           GAP_SIZE += add;
4232           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4233           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4234           /* Don't forget to update SRC, DST.  */
4235           src = GAP_END_ADDR - len_byte;
4236           dst = GPT_ADDR + inserted_byte;
4237         }
4238     }
4239   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
4240
4241   if (multibyte && (fake_multibyte || !encodep && (to - from) != (to_byte - from_byte)))
4242     inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4243
4244   /* Update various buffer positions for the new text.  */
4245   GAP_SIZE -= inserted_byte;
4246   ZV += inserted; Z+= inserted;
4247   ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
4248   GPT += inserted; GPT_BYTE += inserted_byte;
4249
4250   if (adjust)
4251     {
4252       adjust_after_replace (from, from_byte, to, to_byte,
4253                             inserted, inserted_byte);
4254
4255       if (! encodep && ! NILP (coding->post_read_conversion))
4256         {
4257           Lisp_Object val;
4258           int orig_inserted = inserted, pos = PT;
4259
4260           temp_set_point_both (current_buffer, from, from_byte);
4261           val = call1 (coding->post_read_conversion, make_number (inserted));
4262           if (! NILP (val))
4263             {
4264               CHECK_NUMBER (val, 0);
4265               inserted = XFASTINT (val);
4266             }
4267           if (pos >= from + orig_inserted)
4268             temp_set_point (current_buffer, pos + (inserted - orig_inserted));
4269         }
4270       signal_after_change (from, to - from, inserted);
4271     }
4272
4273   {
4274     int skip = (to_byte_orig - to_byte) + (from_byte - from_byte_orig);
4275
4276     coding->consumed = to_byte_orig - from_byte_orig;
4277     coding->consumed_char = skip + (to - from);
4278     coding->produced = skip + inserted_byte;
4279     coding->produced_char = skip + inserted;
4280   }
4281   return 0;
4282 }
4283
4284 Lisp_Object
4285 code_convert_string (str, coding, encodep, nocopy)
4286      Lisp_Object str;
4287      struct coding_system *coding;
4288      int encodep, nocopy;
4289 {
4290   int len;
4291   char *buf;
4292   int from = 0, to = XSTRING (str)->size, to_byte = XSTRING (str)->size_byte;
4293   struct gcpro gcpro1;
4294   Lisp_Object saved_coding_symbol = Qnil;
4295   int result;
4296
4297   if (encodep && !NILP (coding->pre_write_conversion)
4298       || !encodep && !NILP (coding->post_read_conversion))
4299     {
4300       /* Since we have to call Lisp functions which assume target text
4301          is in a buffer, after setting a temporary buffer, call
4302          code_convert_region.  */
4303       int count = specpdl_ptr - specpdl;
4304       struct buffer *prev = current_buffer;
4305
4306       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4307       temp_output_buffer_setup (" *code-converting-work*");
4308       set_buffer_internal (XBUFFER (Vstandard_output));
4309       if (encodep)
4310         insert_from_string (str, 0, 0, to, to_byte, 0);
4311       else
4312         {
4313           /* We must insert the contents of STR as is without
4314              unibyte<->multibyte conversion.  */
4315           current_buffer->enable_multibyte_characters = Qnil;
4316           insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4317           current_buffer->enable_multibyte_characters = Qt;
4318         }
4319       code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
4320       if (encodep)
4321         /* We must return the buffer contents as unibyte string.  */
4322         current_buffer->enable_multibyte_characters = Qnil;
4323       str = make_buffer_string (BEGV, ZV, 0);
4324       set_buffer_internal (prev);
4325       return unbind_to (count, str);
4326     }
4327
4328   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4329     {
4330       /* See the comments in code_convert_region.  */
4331       if (coding->type == coding_type_undecided)
4332         {
4333           detect_coding (coding, XSTRING (str)->data, to_byte);
4334           if (coding->type == coding_type_undecided)
4335             coding->type = coding_type_emacs_mule;
4336         }
4337       if (coding->eol_type == CODING_EOL_UNDECIDED)
4338         {
4339           saved_coding_symbol = coding->symbol;
4340           detect_eol (coding, XSTRING (str)->data, to_byte);
4341           if (coding->eol_type == CODING_EOL_UNDECIDED)
4342             coding->eol_type = CODING_EOL_LF;
4343           /* We had better recover the original eol format if we
4344              encounter an inconsitent eol format while decoding.  */
4345           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4346         }
4347     }
4348
4349   if (encodep
4350       ? ! CODING_REQUIRE_ENCODING (coding)
4351       : ! CODING_REQUIRE_DECODING (coding))
4352     from = to_byte;
4353   else
4354     {
4355       /* Try to skip the heading and tailing ASCIIs.  */
4356       if (encodep)
4357         shrink_encoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4358       else
4359         shrink_decoding_region (&from, &to_byte, coding, XSTRING (str)->data);
4360     }
4361   if (from == to_byte)
4362     return (nocopy ? str : Fcopy_sequence (str));
4363
4364   if (encodep)
4365     len = encoding_buffer_size (coding, to_byte - from);
4366   else
4367     len = decoding_buffer_size (coding, to_byte - from);
4368   len += from + XSTRING (str)->size_byte - to_byte;
4369   GCPRO1 (str);
4370   buf = get_conversion_buffer (len);
4371   UNGCPRO;
4372
4373   if (from > 0)
4374     bcopy (XSTRING (str)->data, buf, from);
4375   result = (encodep
4376             ? encode_coding (coding, XSTRING (str)->data + from,
4377                              buf + from, to_byte - from, len)
4378             : decode_coding (coding, XSTRING (str)->data + from,
4379                              buf + from, to - from, len));
4380   if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4381     {
4382       /* We simple try to decode the whole string again but without
4383          eol-conversion this time.  */
4384       coding->eol_type = CODING_EOL_LF;
4385       coding->symbol = saved_coding_symbol;
4386       return code_convert_string (str, coding, encodep, nocopy);
4387     }
4388
4389   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
4390          XSTRING (str)->size_byte - to_byte);
4391
4392   len = from + XSTRING (str)->size_byte - to_byte;
4393   if (encodep)
4394     str = make_unibyte_string (buf, len + coding->produced);
4395   else
4396     str = make_multibyte_string (buf, len + coding->produced_char,
4397                                  len + coding->produced);
4398   return str;
4399 }
4400
4401 \f
4402 #ifdef emacs
4403 /*** 7. Emacs Lisp library functions ***/
4404
4405 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4406   "Return t if OBJECT is nil or a coding-system.\n\
4407 See the documentation of `make-coding-system' for information\n\
4408 about coding-system objects.")
4409   (obj)
4410      Lisp_Object obj;
4411 {
4412   if (NILP (obj))
4413     return Qt;
4414   if (!SYMBOLP (obj))
4415     return Qnil;
4416   /* Get coding-spec vector for OBJ.  */
4417   obj = Fget (obj, Qcoding_system);
4418   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4419           ? Qt : Qnil);
4420 }
4421
4422 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4423        Sread_non_nil_coding_system, 1, 1, 0,
4424   "Read a coding system from the minibuffer, prompting with string PROMPT.")
4425   (prompt)
4426      Lisp_Object prompt;
4427 {
4428   Lisp_Object val;
4429   do
4430     {
4431       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4432                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
4433     }
4434   while (XSTRING (val)->size == 0);
4435   return (Fintern (val, Qnil));
4436 }
4437
4438 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4439   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4440 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4441   (prompt, default_coding_system)
4442      Lisp_Object prompt, default_coding_system;
4443 {
4444   Lisp_Object val;
4445   if (SYMBOLP (default_coding_system))
4446     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4447   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4448                           Qt, Qnil, Qcoding_system_history,
4449                           default_coding_system, Qnil);
4450   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4451 }
4452
4453 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4454        1, 1, 0,
4455   "Check validity of CODING-SYSTEM.\n\
4456 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4457 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4458 The value of property should be a vector of length 5.")
4459   (coding_system)
4460      Lisp_Object coding_system;
4461 {
4462   CHECK_SYMBOL (coding_system, 0);
4463   if (!NILP (Fcoding_system_p (coding_system)))
4464     return coding_system;
4465   while (1)
4466     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4467 }
4468 \f
4469 Lisp_Object
4470 detect_coding_system (src, src_bytes, highest)
4471      unsigned char *src;
4472      int src_bytes, highest;
4473 {
4474   int coding_mask, eol_type;
4475   Lisp_Object val, tmp;
4476   int dummy;
4477
4478   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4479   eol_type  = detect_eol_type (src, src_bytes, &dummy);
4480   if (eol_type == CODING_EOL_INCONSISTENT)
4481     eol_type == CODING_EOL_UNDECIDED;
4482
4483   if (!coding_mask)
4484     {
4485       val = Qundecided;
4486       if (eol_type != CODING_EOL_UNDECIDED)
4487         {
4488           Lisp_Object val2;
4489           val2 = Fget (Qundecided, Qeol_type);
4490           if (VECTORP (val2))
4491             val = XVECTOR (val2)->contents[eol_type];
4492         }
4493       return val;
4494     }
4495
4496   /* At first, gather possible coding systems in VAL.  */
4497   val = Qnil;
4498   for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4499     {
4500       int idx
4501         = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4502       if (coding_mask & (1 << idx))
4503         {
4504           val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4505           if (highest)
4506             break;
4507         }
4508     }
4509   if (!highest)
4510     val = Fnreverse (val);
4511
4512   /* Then, substitute the elements by subsidiary coding systems.  */
4513   for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4514     {
4515       if (eol_type != CODING_EOL_UNDECIDED)
4516         {
4517           Lisp_Object eol;
4518           eol = Fget (XCONS (tmp)->car, Qeol_type);
4519           if (VECTORP (eol))
4520             XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4521         }
4522     }
4523   return (highest ? XCONS (val)->car : val);
4524 }
4525
4526 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4527        2, 3, 0,
4528   "Detect coding system of the text in the region between START and END.\n\
4529 Return a list of possible coding systems ordered by priority.\n\
4530 \n\
4531 If only ASCII characters are found, it returns `undecided'\n\
4532 or its subsidiary coding system according to a detected end-of-line format.\n\
4533 \n\
4534 If optional argument HIGHEST is non-nil, return the coding system of\n\
4535 highest priority.")
4536   (start, end, highest)
4537      Lisp_Object start, end, highest;
4538 {
4539   int from, to;
4540   int from_byte, to_byte;
4541
4542   CHECK_NUMBER_COERCE_MARKER (start, 0);
4543   CHECK_NUMBER_COERCE_MARKER (end, 1);
4544
4545   validate_region (&start, &end);
4546   from = XINT (start), to = XINT (end);
4547   from_byte = CHAR_TO_BYTE (from);
4548   to_byte = CHAR_TO_BYTE (to);
4549
4550   if (from < GPT && to >= GPT)
4551     move_gap_both (to, to_byte);
4552
4553   return detect_coding_system (BYTE_POS_ADDR (from_byte),
4554                                to_byte - from_byte,
4555                                !NILP (highest));
4556 }
4557
4558 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4559        1, 2, 0,
4560   "Detect coding system of the text in STRING.\n\
4561 Return a list of possible coding systems ordered by priority.\n\
4562 \n\
4563 If only ASCII characters are found, it returns `undecided'\n\
4564 or its subsidiary coding system according to a detected end-of-line format.\n\
4565 \n\
4566 If optional argument HIGHEST is non-nil, return the coding system of\n\
4567 highest priority.")
4568   (string, highest)
4569      Lisp_Object string, highest;
4570 {
4571   CHECK_STRING (string, 0);
4572
4573   return detect_coding_system (XSTRING (string)->data,
4574                                XSTRING (string)->size_byte,
4575                                !NILP (highest));
4576 }
4577
4578 Lisp_Object
4579 code_convert_region1 (start, end, coding_system, encodep)
4580      Lisp_Object start, end, coding_system;
4581      int encodep;
4582 {
4583   struct coding_system coding;
4584   int from, to, len;
4585
4586   CHECK_NUMBER_COERCE_MARKER (start, 0);
4587   CHECK_NUMBER_COERCE_MARKER (end, 1);
4588   CHECK_SYMBOL (coding_system, 2);
4589
4590   validate_region (&start, &end);
4591   from = XFASTINT (start);
4592   to = XFASTINT (end);
4593
4594   if (NILP (coding_system))
4595     return make_number (to - from);
4596
4597   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4598     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4599
4600   coding.mode |= CODING_MODE_LAST_BLOCK;
4601   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4602                        &coding, encodep, 1);
4603   return make_number (coding.produced_char);
4604 }
4605
4606 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4607        3, 3, "r\nzCoding system: ",
4608   "Decode the current region by specified coding system.\n\
4609 When called from a program, takes three arguments:\n\
4610 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
4611 Return length of decoded text.")
4612   (start, end, coding_system)
4613      Lisp_Object start, end, coding_system;
4614 {
4615   return code_convert_region1 (start, end, coding_system, 0);
4616 }
4617
4618 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4619        3, 3, "r\nzCoding system: ",
4620   "Encode the current region by specified coding system.\n\
4621 When called from a program, takes three arguments:\n\
4622 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
4623 Return length of encoded text.")
4624   (start, end, coding_system)
4625      Lisp_Object start, end, coding_system;
4626 {
4627   return code_convert_region1 (start, end, coding_system, 1);
4628 }
4629
4630 Lisp_Object
4631 code_convert_string1 (string, coding_system, nocopy, encodep)
4632      Lisp_Object string, coding_system, nocopy;
4633      int encodep;
4634 {
4635   struct coding_system coding;
4636
4637   CHECK_STRING (string, 0);
4638   CHECK_SYMBOL (coding_system, 1);
4639
4640   if (NILP (coding_system))
4641     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4642
4643   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4644     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4645
4646   coding.mode |= CODING_MODE_LAST_BLOCK;
4647   return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4648 }
4649
4650 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
4651        2, 3, 0,
4652   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4653 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4654 if the decoding operation is trivial.")
4655   (string, coding_system, nocopy)
4656      Lisp_Object string, coding_system, nocopy;
4657 {
4658   return code_convert_string1(string, coding_system, nocopy, 0);
4659 }
4660
4661 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
4662        2, 3, 0,
4663   "Encode STRING to CODING-SYSTEM, and return the result.\n\
4664 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4665 if the encoding operation is trivial.")
4666   (string, coding_system, nocopy)
4667      Lisp_Object string, coding_system, nocopy;
4668 {
4669   return code_convert_string1(string, coding_system, nocopy, 1);
4670 }
4671
4672 \f
4673 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
4674   "Decode a JISX0208 character of shift-jis encoding.\n\
4675 CODE is the character code in SJIS.\n\
4676 Return the corresponding character.")
4677   (code)
4678      Lisp_Object code;
4679 {
4680   unsigned char c1, c2, s1, s2;
4681   Lisp_Object val;
4682
4683   CHECK_NUMBER (code, 0);
4684   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
4685   DECODE_SJIS (s1, s2, c1, c2);
4686   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
4687   return val;
4688 }
4689
4690 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
4691   "Encode a JISX0208 character CHAR to SJIS coding system.\n\
4692 Return the corresponding character code in SJIS.")
4693   (ch)
4694      Lisp_Object ch;
4695 {
4696   int charset, c1, c2, s1, s2;
4697   Lisp_Object val;
4698
4699   CHECK_NUMBER (ch, 0);
4700   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4701   if (charset == charset_jisx0208)
4702     {
4703       ENCODE_SJIS (c1, c2, s1, s2);
4704       XSETFASTINT (val, (s1 << 8) | s2);
4705     }
4706   else
4707     XSETFASTINT (val, 0);
4708   return val;
4709 }
4710
4711 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
4712   "Decode a Big5 character CODE of BIG5 coding system.\n\
4713 CODE is the character code in BIG5.\n\
4714 Return the corresponding character.")
4715   (code)
4716      Lisp_Object code;
4717 {
4718   int charset;
4719   unsigned char b1, b2, c1, c2;
4720   Lisp_Object val;
4721
4722   CHECK_NUMBER (code, 0);
4723   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
4724   DECODE_BIG5 (b1, b2, charset, c1, c2);
4725   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
4726   return val;
4727 }
4728
4729 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
4730   "Encode the Big5 character CHAR to BIG5 coding system.\n\
4731 Return the corresponding character code in Big5.")
4732   (ch)
4733      Lisp_Object ch;
4734 {
4735   int charset, c1, c2, b1, b2;
4736   Lisp_Object val;
4737
4738   CHECK_NUMBER (ch, 0);
4739   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
4740   if (charset == charset_big5_1 || charset == charset_big5_2)
4741     {
4742       ENCODE_BIG5 (charset, c1, c2, b1, b2);
4743       XSETFASTINT (val, (b1 << 8) | b2);
4744     }
4745   else
4746     XSETFASTINT (val, 0);
4747   return val;
4748 }
4749 \f
4750 DEFUN ("set-terminal-coding-system-internal",
4751        Fset_terminal_coding_system_internal,
4752        Sset_terminal_coding_system_internal, 1, 1, 0, "")
4753   (coding_system)
4754      Lisp_Object coding_system;
4755 {
4756   CHECK_SYMBOL (coding_system, 0);
4757   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4758   /* We had better not send unsafe characters to terminal.  */
4759   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
4760
4761   return Qnil;
4762 }
4763
4764 DEFUN ("set-safe-terminal-coding-system-internal",
4765        Fset_safe_terminal_coding_system_internal,
4766        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
4767   (coding_system)
4768      Lisp_Object coding_system;
4769 {
4770   CHECK_SYMBOL (coding_system, 0);
4771   setup_coding_system (Fcheck_coding_system (coding_system),
4772                        &safe_terminal_coding);
4773   return Qnil;
4774 }
4775
4776 DEFUN ("terminal-coding-system",
4777        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
4778   "Return coding system specified for terminal output.")
4779   ()
4780 {
4781   return terminal_coding.symbol;
4782 }
4783
4784 DEFUN ("set-keyboard-coding-system-internal",
4785        Fset_keyboard_coding_system_internal,
4786        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4787   (coding_system)
4788      Lisp_Object coding_system;
4789 {
4790   CHECK_SYMBOL (coding_system, 0);
4791   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
4792   return Qnil;
4793 }
4794
4795 DEFUN ("keyboard-coding-system",
4796        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
4797   "Return coding system specified for decoding keyboard input.")
4798   ()
4799 {
4800   return keyboard_coding.symbol;
4801 }
4802
4803 \f
4804 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
4805        Sfind_operation_coding_system,  1, MANY, 0,
4806   "Choose a coding system for an operation based on the target name.\n\
4807 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
4808 DECODING-SYSTEM is the coding system to use for decoding\n\
4809 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
4810 for encoding (in case OPERATION does encoding).\n\
4811 \n\
4812 The first argument OPERATION specifies an I/O primitive:\n\
4813   For file I/O, `insert-file-contents' or `write-region'.\n\
4814   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
4815   For network I/O, `open-network-stream'.\n\
4816 \n\
4817 The remaining arguments should be the same arguments that were passed\n\
4818 to the primitive.  Depending on which primitive, one of those arguments\n\
4819 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
4820 whichever argument specifies the file name is TARGET.\n\
4821 \n\
4822 TARGET has a meaning which depends on OPERATION:\n\
4823   For file I/O, TARGET is a file name.\n\
4824   For process I/O, TARGET is a process name.\n\
4825   For network I/O, TARGET is a service name or a port number\n\
4826 \n\
4827 This function looks up what specified for TARGET in,\n\
4828 `file-coding-system-alist', `process-coding-system-alist',\n\
4829 or `network-coding-system-alist' depending on OPERATION.\n\
4830 They may specify a coding system, a cons of coding systems,\n\
4831 or a function symbol to call.\n\
4832 In the last case, we call the function with one argument,\n\
4833 which is a list of all the arguments given to this function.")
4834   (nargs, args)
4835      int nargs;
4836      Lisp_Object *args;
4837 {
4838   Lisp_Object operation, target_idx, target, val;
4839   register Lisp_Object chain;
4840
4841   if (nargs < 2)
4842     error ("Too few arguments");
4843   operation = args[0];
4844   if (!SYMBOLP (operation)
4845       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
4846     error ("Invalid first arguement");
4847   if (nargs < 1 + XINT (target_idx))
4848     error ("Too few arguments for operation: %s",
4849            XSYMBOL (operation)->name->data);
4850   target = args[XINT (target_idx) + 1];
4851   if (!(STRINGP (target)
4852         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
4853     error ("Invalid %dth argument", XINT (target_idx) + 1);
4854
4855   chain = ((EQ (operation, Qinsert_file_contents)
4856             || EQ (operation, Qwrite_region))
4857            ? Vfile_coding_system_alist
4858            : (EQ (operation, Qopen_network_stream)
4859               ? Vnetwork_coding_system_alist
4860               : Vprocess_coding_system_alist));
4861   if (NILP (chain))
4862     return Qnil;
4863
4864   for (; CONSP (chain); chain = XCONS (chain)->cdr)
4865     {
4866       Lisp_Object elt;
4867       elt = XCONS (chain)->car;
4868
4869       if (CONSP (elt)
4870           && ((STRINGP (target)
4871                && STRINGP (XCONS (elt)->car)
4872                && fast_string_match (XCONS (elt)->car, target) >= 0)
4873               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
4874         {
4875           val = XCONS (elt)->cdr;
4876           /* Here, if VAL is both a valid coding system and a valid
4877              function symbol, we return VAL as a coding system.  */
4878           if (CONSP (val))
4879             return val;
4880           if (! SYMBOLP (val))
4881             return Qnil;
4882           if (! NILP (Fcoding_system_p (val)))
4883             return Fcons (val, val);
4884           if (! NILP (Ffboundp (val)))
4885             {
4886               val = call1 (val, Flist (nargs, args));
4887               if (CONSP (val))
4888                 return val;
4889               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
4890                 return Fcons (val, val);
4891             }
4892           return Qnil;
4893         }
4894     }
4895   return Qnil;
4896 }
4897
4898 DEFUN ("update-iso-coding-systems", Fupdate_iso_coding_systems,
4899        Supdate_iso_coding_systems, 0, 0, 0,
4900   "Update internal database for ISO2022 based coding systems.\n\
4901 When values of the following coding categories are changed, you must\n\
4902 call this function:\n\
4903   coding-category-iso-7, coding-category-iso-7-tight,\n\
4904   coding-category-iso-8-1, coding-category-iso-8-2,\n\
4905   coding-category-iso-7-else, coding-category-iso-8-else")
4906   ()
4907 {
4908   int i;
4909
4910   for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_ISO_8_ELSE;
4911        i++)
4912     {
4913       if (! coding_system_table[i])
4914         coding_system_table[i]
4915           = (struct coding_system *) xmalloc (sizeof (struct coding_system));
4916       setup_coding_system
4917         (XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value,
4918          coding_system_table[i]);
4919     }
4920   return Qnil;
4921 }
4922
4923 #endif /* emacs */
4924
4925 \f
4926 /*** 8. Post-amble ***/
4927
4928 init_coding_once ()
4929 {
4930   int i;
4931
4932   /* Emacs' internal format specific initialize routine.  */
4933   for (i = 0; i <= 0x20; i++)
4934     emacs_code_class[i] = EMACS_control_code;
4935   emacs_code_class[0x0A] = EMACS_linefeed_code;
4936   emacs_code_class[0x0D] = EMACS_carriage_return_code;
4937   for (i = 0x21 ; i < 0x7F; i++)
4938     emacs_code_class[i] = EMACS_ascii_code;
4939   emacs_code_class[0x7F] = EMACS_control_code;
4940   emacs_code_class[0x80] = EMACS_leading_code_composition;
4941   for (i = 0x81; i < 0xFF; i++)
4942     emacs_code_class[i] = EMACS_invalid_code;
4943   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
4944   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
4945   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
4946   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
4947
4948   /* ISO2022 specific initialize routine.  */
4949   for (i = 0; i < 0x20; i++)
4950     iso_code_class[i] = ISO_control_code;
4951   for (i = 0x21; i < 0x7F; i++)
4952     iso_code_class[i] = ISO_graphic_plane_0;
4953   for (i = 0x80; i < 0xA0; i++)
4954     iso_code_class[i] = ISO_control_code;
4955   for (i = 0xA1; i < 0xFF; i++)
4956     iso_code_class[i] = ISO_graphic_plane_1;
4957   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
4958   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4959   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
4960   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
4961   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
4962   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
4963   iso_code_class[ISO_CODE_ESC] = ISO_escape;
4964   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
4965   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
4966   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
4967
4968   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
4969   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
4970
4971   setup_coding_system (Qnil, &keyboard_coding);
4972   setup_coding_system (Qnil, &terminal_coding);
4973   setup_coding_system (Qnil, &safe_terminal_coding);
4974
4975   bzero (coding_system_table, sizeof coding_system_table);
4976
4977 #if defined (MSDOS) || defined (WINDOWSNT)
4978   system_eol_type = CODING_EOL_CRLF;
4979 #else
4980   system_eol_type = CODING_EOL_LF;
4981 #endif
4982 }
4983
4984 #ifdef emacs
4985
4986 syms_of_coding ()
4987 {
4988   Qtarget_idx = intern ("target-idx");
4989   staticpro (&Qtarget_idx);
4990
4991   Qcoding_system_history = intern ("coding-system-history");
4992   staticpro (&Qcoding_system_history);
4993   Fset (Qcoding_system_history, Qnil);
4994
4995   /* Target FILENAME is the first argument.  */
4996   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
4997   /* Target FILENAME is the third argument.  */
4998   Fput (Qwrite_region, Qtarget_idx, make_number (2));
4999
5000   Qcall_process = intern ("call-process");
5001   staticpro (&Qcall_process);
5002   /* Target PROGRAM is the first argument.  */
5003   Fput (Qcall_process, Qtarget_idx, make_number (0));
5004
5005   Qcall_process_region = intern ("call-process-region");
5006   staticpro (&Qcall_process_region);
5007   /* Target PROGRAM is the third argument.  */
5008   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5009
5010   Qstart_process = intern ("start-process");
5011   staticpro (&Qstart_process);
5012   /* Target PROGRAM is the third argument.  */
5013   Fput (Qstart_process, Qtarget_idx, make_number (2));
5014
5015   Qopen_network_stream = intern ("open-network-stream");
5016   staticpro (&Qopen_network_stream);
5017   /* Target SERVICE is the fourth argument.  */
5018   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5019
5020   Qcoding_system = intern ("coding-system");
5021   staticpro (&Qcoding_system);
5022
5023   Qeol_type = intern ("eol-type");
5024   staticpro (&Qeol_type);
5025
5026   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5027   staticpro (&Qbuffer_file_coding_system);
5028
5029   Qpost_read_conversion = intern ("post-read-conversion");
5030   staticpro (&Qpost_read_conversion);
5031
5032   Qpre_write_conversion = intern ("pre-write-conversion");
5033   staticpro (&Qpre_write_conversion);
5034
5035   Qno_conversion = intern ("no-conversion");
5036   staticpro (&Qno_conversion);
5037
5038   Qundecided = intern ("undecided");
5039   staticpro (&Qundecided);
5040
5041   Qcoding_system_p = intern ("coding-system-p");
5042   staticpro (&Qcoding_system_p);
5043
5044   Qcoding_system_error = intern ("coding-system-error");
5045   staticpro (&Qcoding_system_error);
5046
5047   Fput (Qcoding_system_error, Qerror_conditions,
5048         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5049   Fput (Qcoding_system_error, Qerror_message,
5050         build_string ("Invalid coding system"));
5051
5052   Qcoding_category = intern ("coding-category");
5053   staticpro (&Qcoding_category);
5054   Qcoding_category_index = intern ("coding-category-index");
5055   staticpro (&Qcoding_category_index);
5056
5057   Vcoding_category_table
5058     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5059   staticpro (&Vcoding_category_table);
5060   {
5061     int i;
5062     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5063       {
5064         XVECTOR (Vcoding_category_table)->contents[i]
5065           = intern (coding_category_name[i]);
5066         Fput (XVECTOR (Vcoding_category_table)->contents[i],
5067               Qcoding_category_index, make_number (i));
5068       }
5069   }
5070
5071   Qcharacter_unification_table = intern ("character-unification-table");
5072   staticpro (&Qcharacter_unification_table);
5073   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
5074         make_number (0));
5075
5076   Qcharacter_unification_table_for_decode
5077     = intern ("character-unification-table-for-decode");
5078   staticpro (&Qcharacter_unification_table_for_decode);
5079
5080   Qcharacter_unification_table_for_encode
5081     = intern ("character-unification-table-for-encode");
5082   staticpro (&Qcharacter_unification_table_for_encode);
5083
5084   Qsafe_charsets = intern ("safe-charsets");
5085   staticpro (&Qsafe_charsets);
5086
5087   Qemacs_mule = intern ("emacs-mule");
5088   staticpro (&Qemacs_mule);
5089
5090   Qraw_text = intern ("raw-text");
5091   staticpro (&Qraw_text);
5092
5093   defsubr (&Scoding_system_p);
5094   defsubr (&Sread_coding_system);
5095   defsubr (&Sread_non_nil_coding_system);
5096   defsubr (&Scheck_coding_system);
5097   defsubr (&Sdetect_coding_region);
5098   defsubr (&Sdetect_coding_string);
5099   defsubr (&Sdecode_coding_region);
5100   defsubr (&Sencode_coding_region);
5101   defsubr (&Sdecode_coding_string);
5102   defsubr (&Sencode_coding_string);
5103   defsubr (&Sdecode_sjis_char);
5104   defsubr (&Sencode_sjis_char);
5105   defsubr (&Sdecode_big5_char);
5106   defsubr (&Sencode_big5_char);
5107   defsubr (&Sset_terminal_coding_system_internal);
5108   defsubr (&Sset_safe_terminal_coding_system_internal);
5109   defsubr (&Sterminal_coding_system);
5110   defsubr (&Sset_keyboard_coding_system_internal);
5111   defsubr (&Skeyboard_coding_system);
5112   defsubr (&Sfind_operation_coding_system);
5113   defsubr (&Supdate_iso_coding_systems);
5114
5115   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5116     "List of coding systems.\n\
5117 \n\
5118 Do not alter the value of this variable manually.  This variable should be\n\
5119 updated by the functions `make-coding-system' and\n\
5120 `define-coding-system-alias'.");
5121   Vcoding_system_list = Qnil;
5122
5123   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5124     "Alist of coding system names.\n\
5125 Each element is one element list of coding system name.\n\
5126 This variable is given to `completing-read' as TABLE argument.\n\
5127 \n\
5128 Do not alter the value of this variable manually.  This variable should be\n\
5129 updated by the functions `make-coding-system' and\n\
5130 `define-coding-system-alias'.");
5131   Vcoding_system_alist = Qnil;
5132
5133   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5134     "List of coding-categories (symbols) ordered by priority.");
5135   {
5136     int i;
5137
5138     Vcoding_category_list = Qnil;
5139     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5140       Vcoding_category_list
5141         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5142                  Vcoding_category_list);
5143   }
5144
5145   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
5146     "Specify the coding system for read operations.\n\
5147 It is useful to bind this variable with `let', but do not set it globally.\n\
5148 If the value is a coding system, it is used for decoding on read operation.\n\
5149 If not, an appropriate element is used from one of the coding system alists:\n\
5150 There are three such tables, `file-coding-system-alist',\n\
5151 `process-coding-system-alist', and `network-coding-system-alist'.");
5152   Vcoding_system_for_read = Qnil;
5153
5154   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
5155     "Specify the coding system for write operations.\n\
5156 It is useful to bind this variable with `let', but do not set it globally.\n\
5157 If the value is a coding system, it is used for encoding on write operation.\n\
5158 If not, an appropriate element is used from one of the coding system alists:\n\
5159 There are three such tables, `file-coding-system-alist',\n\
5160 `process-coding-system-alist', and `network-coding-system-alist'.");
5161   Vcoding_system_for_write = Qnil;
5162
5163   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
5164     "Coding system used in the latest file or process I/O.");
5165   Vlast_coding_system_used = Qnil;
5166
5167   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5168     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
5169   inhibit_eol_conversion = 0;
5170
5171   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5172     "Alist to decide a coding system to use for a file I/O operation.\n\
5173 The format is ((PATTERN . VAL) ...),\n\
5174 where PATTERN is a regular expression matching a file name,\n\
5175 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5176 If VAL is a coding system, it is used for both decoding and encoding\n\
5177 the file contents.\n\
5178 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5179 and the cdr part is used for encoding.\n\
5180 If VAL is a function symbol, the function must return a coding system\n\
5181 or a cons of coding systems which are used as above.\n\
5182 \n\
5183 See also the function `find-operation-coding-system'.");
5184   Vfile_coding_system_alist = Qnil;
5185
5186   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5187     "Alist to decide a coding system to use for a process I/O operation.\n\
5188 The format is ((PATTERN . VAL) ...),\n\
5189 where PATTERN is a regular expression matching a program name,\n\
5190 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5191 If VAL is a coding system, it is used for both decoding what received\n\
5192 from the program and encoding what sent to the program.\n\
5193 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5194 and the cdr part is used for encoding.\n\
5195 If VAL is a function symbol, the function must return a coding system\n\
5196 or a cons of coding systems which are used as above.\n\
5197 \n\
5198 See also the function `find-operation-coding-system'.");
5199   Vprocess_coding_system_alist = Qnil;
5200
5201   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5202     "Alist to decide a coding system to use for a network I/O operation.\n\
5203 The format is ((PATTERN . VAL) ...),\n\
5204 where PATTERN is a regular expression matching a network service name\n\
5205 or is a port number to connect to,\n\
5206 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5207 If VAL is a coding system, it is used for both decoding what received\n\
5208 from the network stream and encoding what sent to the network stream.\n\
5209 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5210 and the cdr part is used for encoding.\n\
5211 If VAL is a function symbol, the function must return a coding system\n\
5212 or a cons of coding systems which are used as above.\n\
5213 \n\
5214 See also the function `find-operation-coding-system'.");
5215   Vnetwork_coding_system_alist = Qnil;
5216
5217   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
5218     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
5219   eol_mnemonic_unix = ':';
5220
5221   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
5222     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5223   eol_mnemonic_dos = '\\';
5224
5225   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
5226     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5227   eol_mnemonic_mac = '/';
5228
5229   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5230     "Mnemonic character indicating end-of-line format is not yet decided.");
5231   eol_mnemonic_undecided = ':';
5232
5233   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
5234     "Non-nil means ISO 2022 encoder/decoder do character unification.");
5235   Venable_character_unification = Qt;
5236
5237   DEFVAR_LISP ("standard-character-unification-table-for-decode",
5238     &Vstandard_character_unification_table_for_decode,
5239     "Table for unifying characters when reading.");
5240   Vstandard_character_unification_table_for_decode = Qnil;
5241
5242   DEFVAR_LISP ("standard-character-unification-table-for-encode",
5243     &Vstandard_character_unification_table_for_encode,
5244     "Table for unifying characters when writing.");
5245   Vstandard_character_unification_table_for_encode = Qnil;
5246
5247   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5248     "Alist of charsets vs revision numbers.\n\
5249 While encoding, if a charset (car part of an element) is found,\n\
5250 designate it with the escape sequence identifing revision (cdr part of the element).");
5251   Vcharset_revision_alist = Qnil;
5252
5253   DEFVAR_LISP ("default-process-coding-system",
5254                &Vdefault_process_coding_system,
5255     "Cons of coding systems used for process I/O by default.\n\
5256 The car part is used for decoding a process output,\n\
5257 the cdr part is used for encoding a text to be sent to a process.");
5258   Vdefault_process_coding_system = Qnil;
5259
5260   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5261     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5262 This is a vector of length 256.\n\
5263 If Nth element is non-nil, the existence of code N in a file\n\
5264 \(or output of subprocess) doesn't prevent it to be detected as\n\
5265 a coding system of ISO 2022 variant which has a flag\n\
5266 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5267 or reading output of a subprocess.\n\
5268 Only 128th through 159th elements has a meaning.");
5269   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5270
5271   DEFVAR_LISP ("select-safe-coding-system-function",
5272                &Vselect_safe_coding_system_function,
5273     "Function to call to select safe coding system for encoding a text.\n\
5274 \n\
5275 If set, this function is called to force a user to select a proper\n\
5276 coding system which can encode the text in the case that a default\n\
5277 coding system used in each operation can't encode the text.\n\
5278 \n\
5279 The default value is `select-safe-codign-system' (which see).");
5280   Vselect_safe_coding_system_function = Qnil;
5281
5282 }
5283
5284 #endif /* emacs */