src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. CCL handlers
  29   6. End-of-line handlers
  30   7. C library functions
  31   8. Emacs Lisp library functions
  32   9. Post-amble
  33
  34 */
  35
  36 /*** GENERAL NOTE on CODING SYSTEM ***
  37
  38   A coding system is an encoding mechanism for one or more character
  39   sets.  Here's a list of coding systems which Emacs can handle.  When
  40   we say "decode", it means converting some other coding system to
  41   Emacs' internal format (emacs-mule), and when we say "encode",
  42   it means converting text in emacs-mule format to some other
  43   coding system.
  44
  45   0. Emacs' internal format (emacs-mule)
  46
  47   Emacs itself holds a multi-lingual character in a buffer and a string
  48   in a special format.  Details are described in section 2.
  49
  50   1. ISO2022
  51
  52   The most famous coding system for multiple character sets.  X's
  53   Compound Text, various EUCs (Extended Unix Code), and coding
  54   systems used in Internet communication such as ISO-2022-JP are
  55   all variants of ISO2022.  Details are described in section 3.
  56
  57   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  58
  59   A coding system to encode character sets: ASCII, JISX0201, and
  60   JISX0208.  Widely used for PC's in Japan.  Details are described in
  61   section 4.
  62
  63   3. BIG5
  64
  65   A coding system to encode character sets: ASCII and Big5.  Widely
  66   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  67   described in section 4.  In this file, when we write "BIG5"
  68   (all uppercase), we mean the coding system, and when we write
  69   "Big5" (capitalized), we mean the character set.
  70
  71   4. Raw text
  72
  73   A coding system for a text containing random 8-bit code.  Emacs does
  74   no code conversion on such a text except for end-of-line format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How end-of-line of a text is encoded depends on a system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is usually one byte of
  96   `carriage-return'.
  97
  98   Since text characters encoding and end-of-line encoding are
  99   independent, any coding system described above can take
 100   any format of end-of-line.  So, Emacs has information of format of
 101   end-of-line in each coding-system.  See section 6 for more details.
 102
 103 */
 104
 105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 106
 107   These functions check if a text between SRC and SRC_END is encoded
 108   in the coding system category XXX.  Each returns an integer value in
 109   which the appropriate flag bits for the category XXX are set.  The
 110   flag bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is
 111   the template of these functions.  */
 112 #if 0
 113 int
 114 detect_coding_emacs_mule (src, src_end)
 115      unsigned char *src, *src_end;
 116 {
 117   ...
 118 }
 119 #endif
 120
 121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 122
 123   These functions decode SRC_BYTES length text at SOURCE encoded in
 124   CODING to Emacs' internal format (emacs-mule).  The resulting text
 125   goes to a place pointed to by DESTINATION, the length of which
 126   should not exceed DST_BYTES.  These functions set the information of
 127   original and decoded texts in the members produced, produced_char,
 128   consumed, and consumed_char of the structure *CODING.
 129
 130   The return value is an integer (CODING_FINISH_XXX) indicating how
 131   the decoding finished.
 132
 133   DST_BYTES zero means that the source area and destination area
 134   overlap, which means that we can produce decoded text only until it
 135   reaches the head of the not-yet-decoded source text.
 136
 137   Below is a template of these functions.  */
 138 #if 0
 139 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 140      struct coding_system *coding;
 141      unsigned char *source, *destination;
 142      int src_bytes, dst_bytes;
 143 {
 144   ...
 145 }
 146 #endif
 147
 148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 149
 150   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 151   internal format (emacs-mule) to CODING.  The resulting text goes to
 152   a place pointed to by DESTINATION, the length of which should not
 153   exceed DST_BYTES.  These functions set the information of
 154   original and encoded texts in the members produced, produced_char,
 155   consumed, and consumed_char of the structure *CODING.
 156
 157   The return value is an integer (CODING_FINISH_XXX) indicating how
 158   the encoding finished.
 159
 160   DST_BYTES zero means that the source area and destination area
 161   overlap, which means that we can produce encoded text only until it
 162   reaches the head of the not-yet-encoded source text.
 163
 164   Below is a template of these functions.  */
 165 #if 0
 166 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 167      struct coding_system *coding;
 168      unsigned char *source, *destination;
 169      int src_bytes, dst_bytes;
 170 {
 171   ...
 172 }
 173 #endif
 174
 175 /*** COMMONLY USED MACROS ***/
 176
 177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 178    THREE_MORE_BYTES fetch the next one, two, and three bytes of the
 179    source text into their arguments, advancing `src'.  If too few bytes
 180    remain, they fetch nothing and jump to `label_end_of_loop'.  The
 181    caller must set up `src', `src_end', and that label in advance.  */
 182
 183 #define ONE_MORE_BYTE(c1)       \
 184   do {                          \
 185     if (src < src_end)          \
 186       c1 = *src++;              \
 187     else                        \
 188       goto label_end_of_loop;   \
 189   } while (0)
 190
 191 #define TWO_MORE_BYTES(c1, c2)  \
 192   do {                          \
 193     if (src + 1 < src_end)      \
 194       c1 = *src++, c2 = *src++; \
 195     else                        \
 196       goto label_end_of_loop;   \
 197   } while (0)
 198
 199 #define THREE_MORE_BYTES(c1, c2, c3)            \
 200   do {                                          \
 201     if (src + 2 < src_end)                      \
 202       c1 = *src++, c2 = *src++, c3 = *src++;    \
 203     else                                        \
 204       goto label_end_of_loop;                   \
 205   } while (0)
 206
 207 /* The following three macros DECODE_CHARACTER_ASCII,
 208    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 store
 209    the multi-byte form of a character of each class at `dst', advance
 210    `dst', and count the character in `coding' (composed_chars while
 211    composing, else produced_char).  The caller should point `dst' to an
 212    appropriate area and `coding' to the current coding-system first.  */
 213
 214 /* Decode one ASCII character C; while composing, emit 0xA0, C | 0x80.  */
 215
 216 #define DECODE_CHARACTER_ASCII(c)               \
 217   do {                                          \
 218     if (COMPOSING_P (coding->composing))        \
 219       {                                         \
 220         *dst++ = 0xA0, *dst++ = (c) | 0x80;     \
 221         coding->composed_chars++;               \
 222         if (((c) | 0x80) < 0xA0)                \
 223           coding->fake_multibyte = 1;           \
 224       }                                         \
 225     else                                        \
 226       {                                         \
 227         *dst++ = (c);                           \
 228         coding->produced_char++;                \
 229         if ((c) >= 0x80)                        \
 230           coding->fake_multibyte = 1;           \
 231       }                                         \
 232   } while (0)
 233
 234 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 235    position-code is C; while composing, add 0x20 to the base leading-code.  */
 236
 237 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 238   do {                                                                  \
 239     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 240     if (COMPOSING_P (coding->composing))                                \
 241       {                                                                 \
 242         *dst++ = leading_code + 0x20;                                   \
 243         coding->composed_chars++;                                       \
 244       }                                                                 \
 245     else                                                                \
 246       {                                                                 \
 247         *dst++ = leading_code;                                          \
 248         coding->produced_char++;                                        \
 249       }                                                                 \
 250     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 251       *dst++ = leading_code;                                            \
 252     *dst++ = (c) | 0x80;                                                \
 253     if (((c) | 0x80)  < 0xA0)                                           \
 254       coding->fake_multibyte = 1;                                       \
 255   } while (0)
 256
 257 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 258    position-codes are C1 and C2, by appending C2 to the DIMENSION1 form.  */
 259
 260 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 261   do {                                                  \
 262     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 263     *dst++ = (c2) | 0x80;                               \
 264     if (((c2) | 0x80) < 0xA0)                           \
 265       coding->fake_multibyte = 1;                       \
 266   } while (0)
 267
 268 \f
 269 /*** 1. Preamble ***/
 270
 271 #include <stdio.h>
 272
 273 #ifdef emacs
 274
 275 #include <config.h>
 276 #include "lisp.h"
 277 #include "buffer.h"
 278 #include "charset.h"
 279 #include "ccl.h"
 280 #include "coding.h"
 281 #include "window.h"
 282
 283 #else  /* not emacs */
 284
 285 #include "mulelib.h"
 286
 287 #endif /* not emacs */
 288
     /* Lisp objects named with the `Q' prefix.  NOTE(review): presumably
        Lisp symbols initialized outside this part of the file; confirm
        where they are set up.  */
 289 Lisp_Object Qcoding_system, Qeol_type;
 290 Lisp_Object Qbuffer_file_coding_system;
 291 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 292 Lisp_Object Qno_conversion, Qundecided;
 293 Lisp_Object Qcoding_system_history;
 294 Lisp_Object Qsafe_charsets;
 295 Lisp_Object Qvalid_codes;
 296
     /* These two are only declared here (extern); the rest are defined
        in this file.  */
 297 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 298 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 299 Lisp_Object Qstart_process, Qopen_network_stream;
 300 Lisp_Object Qtarget_idx;
 301
 302 Lisp_Object Vselect_safe_coding_system_function;
 303
 304 /* Mnemonic strings for the Unix, DOS, and Mac formats of end-of-line.  */
 305 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 306 /* Mnemonic string to indicate that the format of end-of-line is not
 307    yet decided.  */
 308 Lisp_Object eol_mnemonic_undecided;
 309
 310 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 311    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 312 int system_eol_type;
 313
 314 #ifdef emacs
 315
 316 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 317
 318 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 319
 320 /* Coding systems emacs-mule and raw-text are for converting only
 321    the end-of-line format.  */
 322 Lisp_Object Qemacs_mule, Qraw_text;
 323
 324 /* Coding systems are passed between Emacs Lisp programs and internal
 325    C routines through the following three variables.  */
 326 /* Coding-system for reading files and receiving data from process.  */
 327 Lisp_Object Vcoding_system_for_read;
 328 /* Coding-system for writing files and sending data to process.  */
 329 Lisp_Object Vcoding_system_for_write;
 330 /* Coding-system actually used in the latest I/O.  */
 331 Lisp_Object Vlast_coding_system_used;
 332
 333 /* A vector of length 256 which contains information about special
 334    Latin codes (especially for dealing with Microsoft codes).  */
 335 Lisp_Object Vlatin_extra_code_table;
 336
 337 /* Flag to inhibit code conversion of end-of-line format.  */
 338 int inhibit_eol_conversion;
 339
 340 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 341 int inherit_process_coding_system;
 342
 343 /* Coding system to be used to encode text for terminal display.  */
 344 struct coding_system terminal_coding;
 345
 346 /* Coding system to be used to encode text for terminal display when
 347    terminal coding system is nil.  */
 348 struct coding_system safe_terminal_coding;
 349
 350 /* Coding system of what is sent from terminal keyboard.  */
 351 struct coding_system keyboard_coding;
 352
 353 /* Default coding system to be used to write a file.  */
 354 struct coding_system default_buffer_file_coding;
 355
     /* NOTE(review): presumably alists that select a coding system for
        files, processes, and network streams respectively; this is
        inferred from the names only -- confirm against their users.  */
 356 Lisp_Object Vfile_coding_system_alist;
 357 Lisp_Object Vprocess_coding_system_alist;
 358 Lisp_Object Vnetwork_coding_system_alist;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qcoding_category, Qcoding_category_index;
 363
 364 /* List of symbols `coding-category-xxx' ordered by priority.  */
 365 Lisp_Object Vcoding_category_list;
 366
 367 /* Table of coding categories (Lisp symbols).  */
 368 Lisp_Object Vcoding_category_table;
 369
 370 /* Table of the symbol name of each coding-category.
        NOTE(review): the order of the names presumably has to match the
        category indices below CODING_CATEGORY_IDX_MAX, which are defined
        outside this file -- confirm before reordering.  */
 371 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 372   "coding-category-emacs-mule",
 373   "coding-category-sjis",
 374   "coding-category-iso-7",
 375   "coding-category-iso-7-tight",
 376   "coding-category-iso-8-1",
 377   "coding-category-iso-8-2",
 378   "coding-category-iso-7-else",
 379   "coding-category-iso-8-else",
 380   "coding-category-ccl",
 381   "coding-category-big5",
 382   "coding-category-raw-text",
 383   "coding-category-binary"
 384 };
 385
 386 /* Table of pointers to the coding system corresponding to each coding
 387    category.  */
 388 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 389
 390 /* Table of coding category masks.  Nth element is a mask for the coding
 391    category whose priority is Nth.  */
 392 static
 393 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 394
 395 /* Flag to tell if we look up translation table on character code
 396    conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 Lisp_Object Vcharset_revision_alist;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 \f
 415 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 416
 417 /* Emacs' internal format for encoding multiple character sets is a
 418    kind of multi-byte encoding, i.e. characters are encoded by
 419    variable-length sequences of one-byte codes.  ASCII characters
 420    and control characters (e.g. `tab', `newline') are represented by
 421    one-byte sequences which are their ASCII codes, in the range 0x00
 422    through 0x7F.  The other characters are represented by a sequence
 423    of `base leading-code', optional `extended leading-code', and one
 424    or two `position-code's.  The length of the sequence is determined
 425    by the base leading-code.  Leading-code takes the range 0x80
 426    through 0x9F, whereas extended leading-code and position-code take
 427    the range 0xA0 through 0xFF.  See `charset.h' for more details
 428    about leading-code and position-code.
 429
 430    There's one exception to this rule.  Special leading-code
 431    `leading-code-composition' denotes that the following several
 432    characters should be composed into one character.  Leading-codes of
 433    components (except for ASCII) have 0x20 added.  An ASCII character
 434    component is represented by a 2-byte sequence of `0xA0' and
 435    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 436    details of composite character.  Hence, we can summarize the code
 437    range as follows:
 438
 439    --- CODE RANGE of Emacs' internal format ---
 440    (character set)      (range)
 441    ASCII                0x00 .. 0x7F
 442    ELSE (1st byte)      0x80 .. 0x9F
 443         (rest bytes)    0xA0 .. 0xFF
 444    ---------------------------------------------
 445
 446   */
 447
 448 enum emacs_code_class_type emacs_code_class[256];  /* Indexed by byte value.  */
 449
 450 /* Go to the next statement, advancing SRC, only if *SRC is accessible
 451    and >= 0xA0; else goto label_end_of_switch (no byte) or return 0.  */
 452 #define CHECK_CODE_RANGE_A0_FF  \
 453   do {                          \
 454     if (src >= src_end)         \
 455       goto label_end_of_switch; \
 456     else if (*src++ < 0xA0)     \
 457       return 0;                 \
 458   } while (0)
 459
 460 /* Detector for Emacs' internal format (see the above "GENERAL NOTES
 461    on `detect_coding_XXX ()' functions").  Return 0 if some byte between
 462    SRC and SRC_END rules it out, else CODING_CATEGORY_MASK_EMACS_MULE.  */
 463
 464 int
 465 detect_coding_emacs_mule (src, src_end)
 466      unsigned char *src, *src_end;
 467 {
 468   int in_composition = 0;
 469   unsigned char code;
 470
 471   for (; src < src_end; )
 472     {
 473       code = *src++;
 474
 475       /* While composing, a byte below 0xA0 ends the sequence; any other
 476          byte is taken as a leading code stored with 0x20 added.  */
 477       if (in_composition)
 478         {
 479           in_composition = (code >= 0xA0);
 480           if (in_composition)
 481             code -= 0x20;
 482         }
 483
 484       switch (emacs_code_class[code])
 485         {
 486         case EMACS_invalid_code:
 487           return 0;
 488
 489         case EMACS_control_code:
 490           /* ESC, SI, and SO disqualify the text; other controls pass.  */
 491           if (code != ISO_CODE_ESC && code != ISO_CODE_SI && code != ISO_CODE_SO)
 492             break;
 493           return 0;
 494
 495         case EMACS_leading_code_composition: /* code == 0x80 */
 496           if (!in_composition)
 497             {
 498               in_composition = 1;
 499               break;
 500             }
 501           CHECK_CODE_RANGE_A0_FF;
 502           break;
 503
 504         case EMACS_leading_code_4:
 505           CHECK_CODE_RANGE_A0_FF;
 506           /* Fall through: two more bytes to check.  */
 507         case EMACS_leading_code_3:
 508           CHECK_CODE_RANGE_A0_FF;
 509           /* Fall through: one more byte to check.  */
 510         case EMACS_leading_code_2:
 511           CHECK_CODE_RANGE_A0_FF;
 512           break;
 513
 514         case EMACS_ascii_code:
 515         case EMACS_linefeed_code:
 516         default:
 517         label_end_of_switch:
 518           break;
 519         }
 520     }
 521   return CODING_CATEGORY_MASK_EMACS_MULE;
 522 }
 523
 524 \f
 525 /*** 3. ISO2022 handlers ***/
 526
 527 /* The following note describes the coding system ISO2022 briefly.
 528    Since the intention of this note is to help understand the
 529    functions in this file, some parts are NOT ACCURATE or OVERLY
 530    SIMPLIFIED.  For thorough understanding, please refer to the
 531    original document of ISO2022.
 532
 533    ISO2022 provides many mechanisms to encode several character sets
 534    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 535    is encoded using bytes less than 128.  This may make the encoded
 536    text a little bit longer, but the text passes more easily through
 537    several gateways, some of which strip off MSB (Most Significant Bit).
 538
 539    There are two kinds of character sets: control character set and
 540    graphic character set.  The former contains control characters such
 541    as `newline' and `escape' to provide control functions (control
 542    functions are also provided by escape sequences).  The latter
 543    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 544    two control character sets and many graphic character sets.
 545
 546    Graphic character sets are classified into one of the following
 547    four classes, according to the number of bytes (DIMENSION) and
 548    number of characters in one dimension (CHARS) of the set:
 549    - DIMENSION1_CHARS94
 550    - DIMENSION1_CHARS96
 551    - DIMENSION2_CHARS94
 552    - DIMENSION2_CHARS96
 553
 554    In addition, each character set is assigned an identification tag,
 555    unique for each set, called "final character" (denoted as <F>
 556    hereafter).  The <F> of each character set is decided by ECMA(*)
 557    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 558    (0x30..0x3F are for private use only).
 559
 560    Note (*): ECMA = European Computer Manufacturers Association
 561
 562    Here are examples of graphic character set [NAME(<F>)]:
 563         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 564         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 565         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 566         o DIMENSION2_CHARS96 -- none for the moment
 567
 568    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 569         C0 [0x00..0x1F] -- control character plane 0
 570         GL [0x20..0x7F] -- graphic character plane 0
 571         C1 [0x80..0x9F] -- control character plane 1
 572         GR [0xA0..0xFF] -- graphic character plane 1
 573
 574    A control character set is directly designated and invoked to C0 or
 575    C1 by an escape sequence.  The most common case is that:
 576    - ISO646's  control character set is designated/invoked to C0, and
 577    - ISO6429's control character set is designated/invoked to C1,
 578    and usually these designations/invocations are omitted in encoded
 579    text.  In a 7-bit environment, only C0 can be used, and a control
 580    character for C1 is encoded by an appropriate escape sequence to
 581    fit into the environment.  All control characters for C1 are
 582    defined to have corresponding escape sequences.
 583
 584    A graphic character set is at first designated to one of four
 585    graphic registers (G0 through G3), then these graphic registers are
 586    invoked to GL or GR.  These designations and invocations can be
 587    done independently.  The most common case is that G0 is invoked to
 588    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 589    these invocations and designations are omitted in encoded text.
 590    In a 7-bit environment, only GL can be used.
 591
 592    When a graphic character set of CHARS94 is invoked to GL, codes
 593    0x20 and 0x7F of the GL area work as control characters SPACE and
 594    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 595    be used.
 596
 597    There are two ways of invocation: locking-shift and single-shift.
 598    With locking-shift, the invocation lasts until the next different
 599    invocation, whereas with single-shift, the invocation affects the
 600    following character only and doesn't affect the locking-shift
 601    state.  Invocations are done by the following control characters or
 602    escape sequences:
 603
 604    ----------------------------------------------------------------------
 605    abbrev  function                  cntrl escape seq   description
 606    ----------------------------------------------------------------------
 607    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 608    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 609    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 610    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 611    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 612    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 613    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 614    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 615    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 616    ----------------------------------------------------------------------
 617    (*) These are not used by any known coding system.
 618
 619    Control characters for these functions are defined by macros
 620    ISO_CODE_XXX in `coding.h'.
 621
 622    Designations are done by the following escape sequences:
 623    ----------------------------------------------------------------------
 624    escape sequence      description
 625    ----------------------------------------------------------------------
 626    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 627    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 628    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 629    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 630    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 631    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 632    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 633    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 634    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 635    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 636    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 637    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 638    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 639    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 640    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 641    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 642    ----------------------------------------------------------------------
 643
 644    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 645    of dimension 1, chars 94, and final character <F>, etc...
 646
 647    Note (*): Although these designations are not allowed in ISO2022,
 648    Emacs accepts them on decoding, and produces them on encoding
 649    CHARS96 character sets in a coding system which is characterized as
 650    7-bit environment, non-locking-shift, and non-single-shift.
 651
 652    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 653    '(' can be omitted.  We refer to this as "short-form" hereafter.
 654
 655    Now you may notice that there are a lot of ways for encoding the
 656    same multilingual text in ISO2022.  Actually, there exist many
 657    coding systems such as Compound Text (used in X11's inter client
 658    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 659    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 660    localized platforms), and all of these are variants of ISO2022.
 661
 662    In addition to the above, Emacs handles two more kinds of escape
 663    sequences: ISO6429's direction specification and Emacs' private
 664    sequence for specifying character composition.
 665
 666    ISO6429's direction specification takes the following form:
 667         o CSI ']'      -- end of the current direction
 668         o CSI '0' ']'  -- end of the current direction
 669         o CSI '1' ']'  -- start of left-to-right text
 670         o CSI '2' ']'  -- start of right-to-left text
 671    The control character CSI (0x9B: control sequence introducer) is
 672    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 673
 674    Character composition specification takes the following form:
 675         o ESC '0' -- start character composition
 676         o ESC '1' -- end character composition
 677    Since these are not standard escape sequences of any ISO standard,
 678    their use with these meanings is restricted to Emacs only.  */
 679
 680 enum iso_code_class_type iso_code_class[256];
 681
     /* Nonzero if coding_system_table[IDX] is set and that coding system
        either has CHARSET marked in its safe_charsets or has a requested
        designation for CHARSET (i.e. the requested designation is not
        CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION).  */
 682 #define CHARSET_OK(idx, charset)                                \
 683   (coding_system_table[idx]                                     \
 684    && (coding_system_table[idx]->safe_charsets[charset]         \
 685        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 686             (coding_system_table[idx], charset)                 \
 687            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 688
     /* Nonzero if the initial designation numbered 1 of
        coding_system_table[IDX] is non-negative (presumably: some charset
        is initially designated to graphic register G1 -- confirm).
        NOTE(review): unlike CHARSET_OK, this does not test that
        coding_system_table[IDX] is set before using it; confirm that
        callers guarantee it.  */
 689 #define SHIFT_OUT_OK(idx) \
 690   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 691
 692 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 693    Check if a text is encoded in ISO2022.  If it is, return an
 694    integer in which the appropriate flag bits among:
 695         CODING_CATEGORY_MASK_ISO_7
 696         CODING_CATEGORY_MASK_ISO_7_TIGHT
 697         CODING_CATEGORY_MASK_ISO_8_1
 698         CODING_CATEGORY_MASK_ISO_8_2
 699         CODING_CATEGORY_MASK_ISO_7_ELSE
 700         CODING_CATEGORY_MASK_ISO_8_ELSE
 701    are set.  If a code which should never appear in ISO2022 is found,
 702    return 0.  */
 703
 704 int
 705 detect_coding_iso2022 (src, src_end)
 706      unsigned char *src, *src_end;
 707 {
 708   int mask = CODING_CATEGORY_MASK_ISO;
 709   int mask_found = 0;
 710   int reg[4], shift_out = 0, single_shifting = 0;
 711   int c, c1, i, charset;
 712
 713   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 714   while (mask && src < src_end)
 715     {
 716       c = *src++;
 717       switch (c)
 718         {
 719         case ISO_CODE_ESC:
 720           single_shifting = 0;
 721           if (src >= src_end)
 722             break;
 723           c = *src++;
 724           if (c >= '(' && c <= '/')
 725             {
 726               /* Designation sequence for a charset of dimension 1.  */
 727               if (src >= src_end)
 728                 break;
 729               c1 = *src++;
 730               if (c1 < ' ' || c1 >= 0x80
 731                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 732                 /* Invalid designation sequence.  Just ignore.  */
 733                 break;
 734               reg[(c - '(') % 4] = charset;
 735             }
 736           else if (c == '$')
 737             {
 738               /* Designation sequence for a charset of dimension 2.  */
 739               if (src >= src_end)
 740                 break;
 741               c = *src++;
 742               if (c >= '@' && c <= 'B')
 743                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 744                 reg[0] = charset = iso_charset_table[1][0][c];
 745               else if (c >= '(' && c <= '/')
 746                 {
 747                   if (src >= src_end)
 748                     break;
 749                   c1 = *src++;
 750                   if (c1 < ' ' || c1 >= 0x80
 751                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 752                     /* Invalid designation sequence.  Just ignore.  */
 753                     break;
 754                   reg[(c - '(') % 4] = charset;
 755                 }
 756               else
 757                 /* Invalid designation sequence.  Just ignore.  */
 758                 break;
 759             }
 760           else if (c == 'N' || c == 'O')
 761             {
 762               /* ESC <Fe> for SS2 or SS3.  */
 763               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 764               break;
 765             }
 766           else if (c == '0' || c == '1' || c == '2')
 767             /* ESC <Fp> for start/end composition.  Just ignore.  */
 768             break;
 769           else
 770             /* Invalid escape sequence.  Just ignore.  */
 771             break;
 772
 773           /* We found a valid designation sequence for CHARSET.  */
 774           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 775           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 776             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 777           else
 778             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 779           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 780             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 781           else
 782             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 783           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 784             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 785           else
 786             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 787           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 788             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 789           else
 790             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 791           break;
 792
 793         case ISO_CODE_SO:
 794           single_shifting = 0;
 795           if (shift_out == 0
 796               && (reg[1] >= 0
 797                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 798                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 799             {
 800               /* Locking shift out.  */
 801               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 802               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 803             }
 804           break;
 805
 806         case ISO_CODE_SI:
 807           single_shifting = 0;
 808           if (shift_out == 1)
 809             {
 810               /* Locking shift in.  */
 811               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 812               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 813             }
 814           break;
 815
 816         case ISO_CODE_CSI:
 817           single_shifting = 0;
 818         case ISO_CODE_SS2:
 819         case ISO_CODE_SS3:
 820           {
 821             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 822
 823             if (c != ISO_CODE_CSI)
 824               {
 825                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 826                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 827                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 828                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 829                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 830                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 831                 single_shifting = 1;
 832               }
 833             if (VECTORP (Vlatin_extra_code_table)
 834                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 835               {
 836                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 837                     & CODING_FLAG_ISO_LATIN_EXTRA)
 838                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 839                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 840                     & CODING_FLAG_ISO_LATIN_EXTRA)
 841                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 842               }
 843             mask &= newmask;
 844             mask_found |= newmask;
 845           }
 846           break;
 847
 848         default:
 849           if (c < 0x80)
 850             {
 851               single_shifting = 0;
 852               break;
 853             }
 854           else if (c < 0xA0)
 855             {
 856               single_shifting = 0;
 857               if (VECTORP (Vlatin_extra_code_table)
 858                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 859                 {
 860                   int newmask = 0;
 861
 862                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 863                       & CODING_FLAG_ISO_LATIN_EXTRA)
 864                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 865                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 866                       & CODING_FLAG_ISO_LATIN_EXTRA)
 867                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 868                   mask &= newmask;
 869                   mask_found |= newmask;
 870                 }
 871               else
 872                 return 0;
 873             }
 874           else
 875             {
 876               unsigned char *src_begin = src;
 877
 878               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 879                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 880               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 881               /* Check the length of succeeding codes of the range
 882                  0xA0..0FF.  If the byte length is odd, we exclude
 883                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 884                  when we are not single shifting.  */
 885               if (!single_shifting)
 886                 {
 887                   while (src < src_end && *src >= 0xA0)
 888                     src++;
 889                   if ((src - src_begin - 1) & 1 && src < src_end)
 890                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 891                   else
 892                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
 893                 }
 894             }
 895           break;
 896         }
 897     }
 898
 899   return (mask & mask_found);
 900 }
 901
 902 /* Decode a character whose charset is CHARSET and whose 1st position
 903    code is C1.  If the dimension of CHARSET is 2, the 2nd position code
 904    is fetched from SRC and set to C2.  If CHARSET is negative, it means
 905    that we are decoding ill-formed text, and all we can do is to
 906    read C1 as is.

        This macro is meant to be expanded inside a decoder: besides its
        arguments it uses the enclosing function's `coding', `src', `dst',
        `c2' and `translation_table', and C1 must be assignable because a
        translation may replace it.  ONE_MORE_BYTE may leave the decoding
        loop when the source runs out (see decode_coding_iso2022).

        If the fetched 2nd byte (masked with 0x7F) is neither of class
        ISO_0x20_or_0x7F nor ISO_graphic_plane_0, it is pushed back onto
        SRC and C1 is decoded as ASCII.  If a translation table is in
        effect and translate_char returns a non-negative character, that
        character replaces the charset and position codes.  When a
        composition is at its head, LEADING_CODE_COMPOSITION (followed by
        0xFF when rules are embedded) is emitted before the character.  */
 907
 908 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 909   do {                                                                  \
 910     int c_alt, charset_alt = (charset);                                 \
 911     if (COMPOSING_HEAD_P (coding->composing))                           \
 912       {                                                                 \
 913         *dst++ = LEADING_CODE_COMPOSITION;                              \
 914         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 915           /* To tell composition rules are embedded.  */                \
 916           *dst++ = 0xFF;                                                \
 917         coding->composing += 2;                                         \
 918       }                                                                 \
 919     if (charset_alt >= 0)                                               \
 920       {                                                                 \
 921         if (CHARSET_DIMENSION (charset_alt) == 2)                       \
 922           {                                                             \
 923             ONE_MORE_BYTE (c2);                                         \
 924             if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
 925                 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
 926               {                                                         \
 927                 src--;                                                  \
 928                 charset_alt = CHARSET_ASCII;                            \
 929               }                                                         \
 930           }                                                             \
 931         if (!NILP (translation_table)                                   \
 932             && ((c_alt = translate_char (translation_table,             \
 933                                          -1, charset_alt, c1, c2)) >= 0)) \
 934           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 935       }                                                                 \
 936     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 937       DECODE_CHARACTER_ASCII (c1);                                      \
 938     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 939       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 940     else                                                                \
 941       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 942     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 943       /* To tell a composition rule follows.  */                        \
 944       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 945   } while (0)
 946
 947 /* Set designation state into CODING: designate to graphic register REG
        the charset identified by DIMENSION, CHARS and FINAL_CHAR.

        This macro is meant to be expanded inside a decoder; it uses the
        enclosing function's `coding' and jumps to its label
        `label_invalid_code' in each of these cases:
          - FINAL_CHAR is below '0' or not below 128;
          - no charset is found, or the charset is neither requested for
            REG by CODING nor listed in its safe_charsets (REG is then
            remembered in last_invalid_designation_register);
          - ASCII is designated to register 0 right after an invalid
            designation to register 0 (see the comment in the body).
        Otherwise the charset is stored as the designation of REG; when
        CODING_MODE_DIRECTION is set and the charset has a reverse
        charset, the reverse charset is stored instead.  */
 948 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
 949   do {                                                                     \
 950     int charset;                                                           \
 951                                                                            \
 952     if (final_char < '0' || final_char >= 128)                             \
 953       goto label_invalid_code;                                             \
 954     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
 955                                  make_number (chars),                      \
 956                                  make_number (final_char));                \
 957     if (charset >= 0                                                       \
 958         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
 959             || coding->safe_charsets[charset]))                            \
 960       {                                                                    \
 961         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
 962             && reg == 0                                                    \
 963             && charset == CHARSET_ASCII)                                   \
 964           {                                                                \
 965             /* We should insert this designation sequence as is so         \
 966                that it is surely written back to a file.  */               \
 967             coding->spec.iso2022.last_invalid_designation_register = -1;   \
 968             goto label_invalid_code;                                       \
 969           }                                                                \
 970         coding->spec.iso2022.last_invalid_designation_register = -1;       \
 971         if ((coding->mode & CODING_MODE_DIRECTION)                         \
 972             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
 973           charset = CHARSET_REVERSE_CHARSET (charset);                     \
 974         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
 975       }                                                                    \
 976     else                                                                   \
 977       {                                                                    \
 978         coding->spec.iso2022.last_invalid_designation_register = reg;      \
 979         goto label_invalid_code;                                           \
 980       }                                                                    \
 981   } while (0)
 982
 983 /* Return 0 if there's a valid composing sequence starting at SRC and
 984    ending before SRC_END, else return -1.

        SRC points just after the escape sequence that starts a
        composition.  The text is scanned up to the terminating "ESC 1":
          - bytes of 0x20 and above are skipped;
          - any control code other than ESC makes the sequence invalid;
          - an escape sequence other than "ESC 1" must be a designation
            (optionally introduced by '$' for a 2-byte charset), CODING
            must have CODING_FLAG_ISO_DESIGNATION set, and the designated
            charset must be both listed in CODING's safe_charsets and
            have a requested designation in CODING.
        Reaching SRC_END without finding "ESC 1" is also invalid.  */
 985
 986 int
 987 check_composing_code (coding, src, src_end)
 988      struct coding_system *coding;
 989      unsigned char *src, *src_end;
 990 {
 991   int charset, c, c1, dim;
 992
 993   while (src < src_end)
 994     {
 995       c = *src++;
 996       if (c >= 0x20)
 997         continue;
 998       if (c != ISO_CODE_ESC || src >= src_end)
 999         return -1;
1000       c = *src++;
1001       if (c == '1') /* end of composition */
1002         return 0;
           /* The flag test must be parenthesized: `!' binds tighter
              than `&', so the unparenthesized form tested
              (!coding->flags) & FLAG instead of the flag itself.  */
1003       if (src + 2 >= src_end
1004           || !(coding->flags & CODING_FLAG_ISO_DESIGNATION))
1005         return -1;
1006
1007       dim = (c == '$');
           /* For the short form "ESC $ @..B" the final byte is left in
              place so that it is read as C1 below; otherwise the
              intermediate byte is consumed here.  */
1008       if (dim == 1)
1009         c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1010       if (c >= '(' && c <= '/')
1011         {
1012           c1 = *src++;
1013           if ((c1 < ' ' || c1 >= 0x80)
1014               || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1015               || ! coding->safe_charsets[charset]
1016               || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1017                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1018             return -1;
1019         }
1020       else
1021         return -1;
1022     }
1023
1024   /* We have not found the sequence "ESC 1".  */
1025   return -1;
1026 }
1027
1028 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode the SRC_BYTES bytes of ISO2022 text at SOURCE according to
        CODING and write the result at DESTINATION, which has room for
        DST_BYTES bytes.  On return CODING->consumed and
        CODING->consumed_char hold the number of source bytes used,
        CODING->produced the number of bytes written, and
        CODING->produced_char the number of characters produced.
        CODING->fake_multibyte is set when raw bytes were copied through.

        The value is one of:
          CODING_FINISH_NORMAL            all of the source was decoded;
          CODING_FINISH_INSUFFICIENT_SRC  the source ended in the middle
                                          of a sequence;
          CODING_FINISH_INSUFFICIENT_DST  the loop stopped for lack of
                                          room with source left over;
          CODING_FINISH_INCONSISTENT_EOL  an end-of-line not matching
                                          CODING->eol_type was found
                                          while such lines are inhibited.

        When CODING_MODE_LAST_BLOCK is set and the source ended in the
        middle of a sequence, the remaining bytes are copied as is.  */
1029
1030 int
1031 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1032      struct coding_system *coding;
1033      unsigned char *source, *destination;
1034      int src_bytes, dst_bytes;
1035 {
1036   unsigned char *src = source;
1037   unsigned char *src_end = source + src_bytes;
1038   unsigned char *dst = destination;
1039   unsigned char *dst_end = destination + dst_bytes;
1040   /* Since the maximum bytes produced by each loop is 7, we subtract 6
1041      from DST_END to assure that overflow checking is necessary only
1042      at the head of loop.  */
1043   unsigned char *adjusted_dst_end = dst_end - 6;
1044   int charset;
1045   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1046   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1047   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1048   Lisp_Object translation_table
1049     = coding->translation_table_for_decode;
1050   int result = CODING_FINISH_NORMAL;
1051
       /* Fall back to the standard table when translation is enabled
          and CODING has no table of its own.  */
1052   if (!NILP (Venable_character_translation) && NILP (translation_table))
1053     translation_table = Vstandard_translation_table_for_decode;
1054
1055   coding->produced_char = 0;
1056   coding->fake_multibyte = 0;
       /* NOTE(review): when DST_BYTES is zero the guard compares DST
          with SRC instead of with the end of the destination;
          presumably source and destination then share one buffer --
          confirm with the callers.  */
1057   while (src < src_end && (dst_bytes
1058                            ? (dst < adjusted_dst_end)
1059                            : (dst < src - 6)))
1060     {
1061       /* SRC_BASE remembers the start position in source in each loop.
1062          The loop will be exited when there's not enough source text
1063          to analyze long escape sequence or 2-byte code (within macros
1064          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
1065          to SRC_BASE before exiting.  */
1066       unsigned char *src_base = src;
1067       int c1 = *src++, c2;
1068
1069       switch (iso_code_class [c1])
1070         {
1071         case ISO_0x20_or_0x7F:
1072           if (!coding->composing
1073               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1074             {
1075               /* This is SPACE or DEL.  */
1076               *dst++ = c1;
1077               coding->produced_char++;
1078               break;
1079             }
1080           /* This is a graphic character, we fall down ...  */
1081
1082         case ISO_graphic_plane_0:
1083           if (coding->composing == COMPOSING_WITH_RULE_RULE)
1084             {
1085               /* This is a composition rule.  */
1086               *dst++ = c1 | 0x80;
1087               coding->composing = COMPOSING_WITH_RULE_TAIL;
1088             }
1089           else
1090             DECODE_ISO_CHARACTER (charset0, c1);
1091           break;
1092
1093         case ISO_0xA0_or_0xFF:
1094           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1095               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1096             goto label_invalid_code;
1097           /* This is a graphic character, we fall down ... */
1098
1099         case ISO_graphic_plane_1:
1100           if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1101             goto label_invalid_code;
1102           else
1103             DECODE_ISO_CHARACTER (charset1, c1);
1104           break;
1105
1106         case ISO_control_code:
1107           /* All ISO2022 control characters in this class have the
1108              same representation in Emacs internal format.  */
1109           if (c1 == '\n'
1110               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1111               && (coding->eol_type == CODING_EOL_CR
1112                   || coding->eol_type == CODING_EOL_CRLF))
1113             {
1114               result = CODING_FINISH_INCONSISTENT_EOL;
1115               goto label_end_of_loop_2;
1116             }
1117           *dst++ = c1;
1118           coding->produced_char++;
1119           if (c1 >= 0x80)
1120             coding->fake_multibyte = 1;
1121           break;
1122
1123         case ISO_carriage_return:
1124           if (coding->eol_type == CODING_EOL_CR)
1125             *dst++ = '\n';
1126           else if (coding->eol_type == CODING_EOL_CRLF)
1127             {
1128               ONE_MORE_BYTE (c1);
1129               if (c1 == ISO_CODE_LF)
1130                 *dst++ = '\n';
1131               else
1132                 {
1133                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1134                     {
1135                       result = CODING_FINISH_INCONSISTENT_EOL;
1136                       goto label_end_of_loop_2;
1137                     }
                       /* A lone CR: push back the byte after it and
                          emit the CR itself.  */
1138                   src--;
1139                   *dst++ = '\r';
1140                 }
1141             }
1142           else
1143             *dst++ = c1;
1144           coding->produced_char++;
1145           break;
1146
1147         case ISO_shift_out:
1148           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1149               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1150             goto label_invalid_code;
1151           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1152           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1153           break;
1154
1155         case ISO_shift_in:
1156           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1157             goto label_invalid_code;
1158           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1159           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1160           break;
1161
1162         case ISO_single_shift_2_7:
1163         case ISO_single_shift_2:
1164           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1165             goto label_invalid_code;
1166           /* SS2 is handled as an escape sequence of ESC 'N' */
1167           c1 = 'N';
1168           goto label_escape_sequence;
1169
1170         case ISO_single_shift_3:
1171           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1172             goto label_invalid_code;
1173           /* SS3 is handled as an escape sequence of ESC 'O' */
1174           c1 = 'O';
1175           goto label_escape_sequence;
1176
1177         case ISO_control_sequence_introducer:
1178           /* CSI is handled as an escape sequence of ESC '[' ...  */
1179           c1 = '[';
1180           goto label_escape_sequence;
1181
1182         case ISO_escape:
1183           ONE_MORE_BYTE (c1);
1184         label_escape_sequence:
1185           /* Escape sequences handled by Emacs are invocation,
1186              designation, direction specification, and character
1187              composition specification.  */
1188           switch (c1)
1189             {
1190             case '&':           /* revision of following character set */
1191               ONE_MORE_BYTE (c1);
1192               if (!(c1 >= '@' && c1 <= '~'))
1193                 goto label_invalid_code;
                   /* The revision must be followed by another escape
                      sequence, which is then handled as usual.  */
1194               ONE_MORE_BYTE (c1);
1195               if (c1 != ISO_CODE_ESC)
1196                 goto label_invalid_code;
1197               ONE_MORE_BYTE (c1);
1198               goto label_escape_sequence;
1199
1200             case '$':           /* designation of 2-byte character set */
1201               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1202                 goto label_invalid_code;
1203               ONE_MORE_BYTE (c1);
1204               if (c1 >= '@' && c1 <= 'B')
1205                 {       /* designation of JISX0208.1978, GB2312.1980,
1206                            or JISX0208.1980 */
1207                   DECODE_DESIGNATION (0, 2, 94, c1);
1208                 }
1209               else if (c1 >= 0x28 && c1 <= 0x2B)
1210                 {       /* designation of DIMENSION2_CHARS94 character set */
1211                   ONE_MORE_BYTE (c2);
1212                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1213                 }
1214               else if (c1 >= 0x2C && c1 <= 0x2F)
1215                 {       /* designation of DIMENSION2_CHARS96 character set */
1216                   ONE_MORE_BYTE (c2);
1217                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1218                 }
1219               else
1220                 goto label_invalid_code;
1221               break;
1222
1223             case 'n':           /* invocation of locking-shift-2 */
1224               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1225                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1226                 goto label_invalid_code;
1227               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1228               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1229               break;
1230
1231             case 'o':           /* invocation of locking-shift-3 */
1232               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1233                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1234                 goto label_invalid_code;
1235               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1236               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1237               break;
1238
1239             case 'N':           /* invocation of single-shift-2 */
1240               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1242                 goto label_invalid_code;
1243               ONE_MORE_BYTE (c1);
1244               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1245               DECODE_ISO_CHARACTER (charset, c1);
1246               break;
1247
1248             case 'O':           /* invocation of single-shift-3 */
1249               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1250                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1251                 goto label_invalid_code;
1252               ONE_MORE_BYTE (c1);
1253               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1254               DECODE_ISO_CHARACTER (charset, c1);
1255               break;
1256
1257             case '0': case '2': /* start composing */
1258               /* Before processing composing, we must be sure that all
1259                  characters being composed are supported by CODING.
1260                  If not, we must give up composing.  */
1261               if (check_composing_code (coding, src, src_end) == 0)
1262                 {
1263                   /* We are looking at a valid composition sequence.  */
1264                   coding->composing = (c1 == '0'
1265                                        ? COMPOSING_NO_RULE_HEAD
1266                                        : COMPOSING_WITH_RULE_HEAD);
1267                   coding->composed_chars = 0;
1268                 }
1269               else
1270                 {
                       /* Not a valid composition: pass the escape
                          sequence through unchanged.  */
1271                   *dst++ = ISO_CODE_ESC;
1272                   *dst++ = c1;
1273                   coding->produced_char += 2;
1274                 }
1275               break;
1276
1277             case '1':           /* end composing */
1278               if (!coding->composing)
1279                 {
                       /* No composition is open: pass the escape
                          sequence through unchanged.  */
1280                   *dst++ = ISO_CODE_ESC;
1281                   *dst++ = c1;
1282                   coding->produced_char += 2;
1283                   break;
1284                 }
1285
1286               if (coding->composed_chars > 0)
1287                 {
1288                   if (coding->composed_chars == 1)
1289                     {
1290                       unsigned char *this_char_start = dst;
1291                       int this_bytes;
1292
1293                       /* Only one character is in the composing
1294                          sequence.  Make it a normal character.  */
1295                       while (*--this_char_start != LEADING_CODE_COMPOSITION);
1296                       dst = (this_char_start
1297                              + (coding->composing == COMPOSING_NO_RULE_TAIL
1298                                 ? 1 : 2));
1299                       *dst -= 0x20;
1300                       if (*dst == 0x80)
1301                         *++dst &= 0x7F;
1302                       this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1303                       while (this_bytes--) *this_char_start++ = *dst++;
1304                       dst = this_char_start;
1305                     }
1306                   coding->produced_char++;
1307                 }
1308               coding->composing = COMPOSING_NO;
1309               break;
1310
1311             case '[':           /* specification of direction */
1312               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1313                 goto label_invalid_code;
1314               /* For the moment, nested direction is not supported.
1315                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1316                  left-to-right, and nonzero means right-to-left.  */
1317               ONE_MORE_BYTE (c1);
1318               switch (c1)
1319                 {
1320                 case ']':       /* end of the current direction */
                       /* The sequence without a parameter is complete
                          here.  It must not fall into the cases below,
                          which would consume one more byte and require
                          it to be `]'.  */
1321                   coding->mode &= ~CODING_MODE_DIRECTION;
1322                   break;
1323                 case '0':       /* end of the current direction */
1324                 case '1':       /* start of left-to-right direction */
1325                   ONE_MORE_BYTE (c1);
1326                   if (c1 == ']')
1327                     coding->mode &= ~CODING_MODE_DIRECTION;
1328                   else
1329                     goto label_invalid_code;
1330                   break;
1331
1332                 case '2':       /* start of right-to-left direction */
1333                   ONE_MORE_BYTE (c1);
1334                   if (c1 == ']')
1335                     coding->mode |= CODING_MODE_DIRECTION;
1336                   else
1337                     goto label_invalid_code;
1338                   break;
1339
1340                 default:
1341                   goto label_invalid_code;
1342                 }
1343               break;
1344
1345             default:
1346               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1347                 goto label_invalid_code;
1348               if (c1 >= 0x28 && c1 <= 0x2B)
1349                 {       /* designation of DIMENSION1_CHARS94 character set */
1350                   ONE_MORE_BYTE (c2);
1351                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1352                 }
1353               else if (c1 >= 0x2C && c1 <= 0x2F)
1354                 {       /* designation of DIMENSION1_CHARS96 character set */
1355                   ONE_MORE_BYTE (c2);
1356                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1357                 }
1358               else
1359                 {
1360                   goto label_invalid_code;
1361                 }
1362             }
1363           /* We must update these variables now.  */
1364           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1365           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1366           break;
1367
1368         label_invalid_code:
               /* Copy everything read in this iteration through as is.  */
1369           while (src_base < src)
1370             *dst++ = *src_base++;
1371           coding->fake_multibyte = 1;
1372         }
1373       continue;
1374
           /* The two labels below are reached only by `goto'.  Both
              rewind SRC to the start of the current iteration before
              leaving the loop.  */
1375     label_end_of_loop:
1376       result = CODING_FINISH_INSUFFICIENT_SRC;
1377     label_end_of_loop_2:
1378       src = src_base;
1379       break;
1380     }
1381
1382   if (src < src_end)
1383     {
1384       if (result == CODING_FINISH_NORMAL)
1385         result = CODING_FINISH_INSUFFICIENT_DST;
1386       else if (result != CODING_FINISH_INCONSISTENT_EOL
1387                && coding->mode & CODING_MODE_LAST_BLOCK)
1388         {
1389           /* This is the last block of the text to be decoded.  We had
1390              better just flush out all remaining codes in the text
1391              although they are not valid characters.  */
1392           src_bytes = src_end - src;
1393           if (dst_bytes && (dst_end - dst < src_bytes))
1394             src_bytes = dst_end - dst;
1395           bcopy (src, dst, src_bytes);
1396           dst += src_bytes;
1397           src += src_bytes;
1398           coding->fake_multibyte = 1;
1399         }
1400     }
1401
1402   coding->consumed = coding->consumed_char = src - source;
1403   coding->produced = dst - destination;
1404   return result;
1405 }
1406
1407 /* ISO2022 encoding stuff.  */
1408
1409 /*
1410    Saying just "ISO2022" is not enough when encoding; we have to
1411    specify more details.  In Emacs, each coding system that is an
1412    ISO2022 variant has the following specifications:
1413         1. Initial designation to G0 thru G3.
1414         2. Allows short-form designation?
1415         3. ASCII should be designated to G0 before control characters?
1416         4. ASCII should be designated to G0 at end of line?
1417         5. 7-bit environment or 8-bit environment?
1418         6. Use locking-shift?
1419         7. Use Single-shift?
1420    And the following two are only for Japanese:
1421         8. Use ASCII in place of JIS0201-1976-Roman?
1422         9. Use JISX0208-1983 in place of JISX0208-1978?
1423    These specifications are encoded in `coding->flags' as flag bits
1424    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1425    details.
1426 */
1427
1428 /* Produce codes (escape sequence) for designating CHARSET to graphic
1429    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1430    the coding system CODING allows, produce designation sequence of
1431    short-form.

        Output is stored through DST, a pointer the expansion site must
        have in scope; it is advanced past the bytes written.  REG
        indexes the four-character intermediate tables below, so it must
        be in the range 0..3.

        Bytes produced, in order:
          - ESC '&' ('@' + revision), only when the revision number
            recorded for CHARSET is below 255;
          - ESC;
          - for a one-dimensional charset, one of "()*+" (94 characters)
            or ",-./" (96 characters) selected by REG;
          - otherwise '$', then the REG-selected character from "()*+"
            (94 characters) or ",-./" (96 characters); for a
            94-character set this character is omitted when
            CODING_FLAG_ISO_SHORT_FORM is set, REG is 0 and the final
            character is '@', 'A' or 'B';
          - the final character of CHARSET.

        Finally CHARSET is recorded as the current designation of REG.  */
1432
1433 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1434   do {                                                                  \
1435     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1436     char *intermediate_char_94 = "()*+";                                \
1437     char *intermediate_char_96 = ",-./";                                \
1438     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
1439     if (revision < 255)                                                 \
1440       {                                                                 \
1441         *dst++ = ISO_CODE_ESC;                                          \
1442         *dst++ = '&';                                                   \
1443         *dst++ = '@' + revision;                                        \
1444       }                                                                 \
1445     *dst++ = ISO_CODE_ESC;                                              \
1446     if (CHARSET_DIMENSION (charset) == 1)                               \
1447       {                                                                 \
1448         if (CHARSET_CHARS (charset) == 94)                              \
1449           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1450         else                                                            \
1451           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1452       }                                                                 \
1453     else                                                                \
1454       {                                                                 \
1455         *dst++ = '$';                                                   \
1456         if (CHARSET_CHARS (charset) == 94)                              \
1457           {                                                             \
1458             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1459                 || reg != 0                                             \
1460                 || final_char < '@' || final_char > 'B')                \
1461               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1462           }                                                             \
1463         else                                                            \
1464           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1465       }                                                                 \
1466     *dst++ = final_char;                                                \
1467     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1468   } while (0)
1469
1470 /* The following two macros produce codes (control character or escape
1471    sequence) for ISO2022 single-shift functions (single-shift-2 and
1472    single-shift-3).

        Both use the variables `coding' and `dst' of the enclosing code.
        When CODING_FLAG_ISO_SEVEN_BITS is set the two-byte escape
        sequence ESC 'N' (single-shift-2) or ESC 'O' (single-shift-3) is
        written.  Otherwise the single control byte ISO_CODE_SS2 or
        ISO_CODE_SS3 is written and coding->fake_multibyte is set to 1.

        In either case the single-shifting state of CODING is set to 1;
        ENCODE_ISO_CHARACTER_DIMENSION1 and
        ENCODE_ISO_CHARACTER_DIMENSION2 test that state, emit the next
        character accordingly and reset it to 0.  */
1473
1474 #define ENCODE_SINGLE_SHIFT_2                           \
1475   do {                                                  \
1476     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1477       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1478     else                                                \
1479       {                                                 \
1480         *dst++ = ISO_CODE_SS2;                          \
1481         coding->fake_multibyte = 1;                     \
1482       }                                                 \
1483     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1484   } while (0)
1485
1486 #define ENCODE_SINGLE_SHIFT_3                           \
1487   do {                                                  \
1488     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1489       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1490     else                                                \
1491       {                                                 \
1492         *dst++ = ISO_CODE_SS3;                          \
1493         coding->fake_multibyte = 1;                     \
1494       }                                                 \
1495     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1496   } while (0)
1497
1498 /* The following four macros produce codes (control character or
1499    escape sequence) for ISO2022 locking-shift functions (shift-in,
1500    shift-out, locking-shift-2, and locking-shift-3).

        Each one writes its code through the variable `dst' of the
        enclosing code and then records in CODING which graphic register
        is now invoked to graphic plane 0:

          ENCODE_SHIFT_IN          ISO_CODE_SI    register 0
          ENCODE_SHIFT_OUT         ISO_CODE_SO    register 1
          ENCODE_LOCKING_SHIFT_2   ESC 'n'        register 2
          ENCODE_LOCKING_SHIFT_3   ESC 'o'        register 3  */
1501
1502 #define ENCODE_SHIFT_IN                         \
1503   do {                                          \
1504     *dst++ = ISO_CODE_SI;                       \
1505     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1506   } while (0)
1507
1508 #define ENCODE_SHIFT_OUT                        \
1509   do {                                          \
1510     *dst++ = ISO_CODE_SO;                       \
1511     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1512   } while (0)
1513
1514 #define ENCODE_LOCKING_SHIFT_2                  \
1515   do {                                          \
1516     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1517     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1518   } while (0)
1519
1520 #define ENCODE_LOCKING_SHIFT_3                  \
1521   do {                                          \
1522     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1523     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1524   } while (0)
1525
1526 /* Produce codes for a DIMENSION1 character whose character set is
1527    CHARSET and whose position-code is C1.  Designation and invocation
1528    sequences are also produced in advance if necessary.

        The expansion reads and updates the variables `coding' and `dst'
        of the enclosing code.  It loops until one of these cases emits
        the character and leaves the loop:
          - a single-shift is pending: emit C1 with the high bit cleared
            (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise), and
            clear the pending state;
          - CHARSET is the charset of graphic plane 0: emit C1 & 0x7F;
          - CHARSET is the charset of graphic plane 1: emit C1 | 0x80;
          - CODING_FLAG_ISO_SAFE is set and CHARSET has no entry in
            coding->safe_charsets: emit
            CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice when
            CHARSET_WIDTH (CHARSET) is 2.
        In any other case encode_invocation_designation is called to
        emit the designation and/or shift codes, and the tests are
        repeated.  */
1529
1530
1531 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1532   do {                                                                  \
1533     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1534       {                                                                 \
1535         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1536           *dst++ = c1 & 0x7F;                                           \
1537         else                                                            \
1538           *dst++ = c1 | 0x80;                                           \
1539         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1540         break;                                                          \
1541       }                                                                 \
1542     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1543       {                                                                 \
1544         *dst++ = c1 & 0x7F;                                             \
1545         break;                                                          \
1546       }                                                                 \
1547     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1548       {                                                                 \
1549         *dst++ = c1 | 0x80;                                             \
1550         break;                                                          \
1551       }                                                                 \
1552     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1553              && !coding->safe_charsets[charset])                        \
1554       {                                                                 \
1555         /* We should not encode this character, instead produce one or  \
1556            two `?'s.  */                                                \
1557         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1558         if (CHARSET_WIDTH (charset) == 2)                               \
1559           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1560         break;                                                          \
1561       }                                                                 \
1562     else                                                                \
1563       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1564          must invoke it, or, at first, designate it to some graphic     \
1565          register.  Then repeat the loop to actually produce the        \
1566          character.  */                                                 \
1567       dst = encode_invocation_designation (charset, coding, dst);       \
1568   } while (1)
1569
1570 /* Produce codes for a DIMENSION2 character whose character set is
1571    CHARSET and whose position-codes are C1 and C2.  Designation and
1572    invocation codes are also produced in advance if necessary.

        The expansion reads and updates the variables `coding' and `dst'
        of the enclosing code.  It loops until one of these cases emits
        the character and leaves the loop:
          - a single-shift is pending: emit C1 and C2 with the high bit
            cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise),
            and clear the pending state;
          - CHARSET is the charset of graphic plane 0: emit C1 & 0x7F
            and C2 & 0x7F;
          - CHARSET is the charset of graphic plane 1: emit C1 | 0x80
            and C2 | 0x80;
          - CODING_FLAG_ISO_SAFE is set and CHARSET has no entry in
            coding->safe_charsets: emit
            CODING_INHIBIT_CHARACTER_SUBSTITUTION once, or twice when
            CHARSET_WIDTH (CHARSET) is 2.
        In any other case encode_invocation_designation is called to
        emit the designation and/or shift codes, and the tests are
        repeated.  */
1573
1574 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1575   do {                                                                  \
1576     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1577       {                                                                 \
1578         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1579           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1580         else                                                            \
1581           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1582         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1583         break;                                                          \
1584       }                                                                 \
1585     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1586       {                                                                 \
1587         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1588         break;                                                          \
1589       }                                                                 \
1590     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1591       {                                                                 \
1592         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1593         break;                                                          \
1594       }                                                                 \
1595     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1596              && !coding->safe_charsets[charset])                        \
1597       {                                                                 \
1598         /* We should not encode this character, instead produce one or  \
1599            two `?'s.  */                                                \
1600         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1601         if (CHARSET_WIDTH (charset) == 2)                               \
1602           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1603         break;                                                          \
1604       }                                                                 \
1605     else                                                                \
1606       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1607          must invoke it, or, at first, designate it to some graphic     \
1608          register.  Then repeat the loop to actually produce the        \
1609          character.  */                                                 \
1610       dst = encode_invocation_designation (charset, coding, dst);       \
1611   } while (1)
1612
     /* Produce codes for one character given as CHARSET and position
        codes C1 and C2 (C2 is a dummy for a one-dimensional charset).
        The expansion uses `coding', `dst' and `translation_table' of the
        enclosing code.  C1 and C2 must be assignable variables.

        If TRANSLATION_TABLE is non-nil and translate_char returns a
        non-negative character, that character replaces the original:
        C1 and C2 are overwritten with its position codes and its charset
        is used from then on.

        The JISX0201-Roman (CODING_FLAG_ISO_USE_ROMAN) and JISX0208-1978
        (CODING_FLAG_ISO_USE_OLDJIS) substitutions are decided from the
        charset that is actually going to be encoded, i.e. the one
        obtained after translation.  Testing the untranslated CHARSET
        instead would replace the charset of a translated character by
        one its position codes do not belong to, and would skip the
        substitution for a character translated into ASCII or JISX0208.

        A charset that is not defined is written out raw: CHARSET, C1 and
        (if non-zero) C2, each masked to 7 bits when
        CODING_FLAG_ISO_SEVEN_BITS is set.

        coding->consumed_char is incremented unless a composition is in
        progress.  */
1613 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
1614   do {                                                                  \
1615     int c_alt, charset_alt;                                             \
1616     if (!NILP (translation_table)                                       \
1617         && ((c_alt = translate_char (translation_table, -1,             \
1618                                      charset, c1, c2))                  \
1619             >= 0))                                                      \
1620       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
1621     else                                                                \
1622       charset_alt = charset;                                            \
1623     if (CHARSET_DEFINED_P (charset_alt))                                \
1624       {                                                                 \
1625         if (CHARSET_DIMENSION (charset_alt) == 1)                       \
1626           {                                                             \
1627             if (charset_alt == CHARSET_ASCII                            \
1628                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)           \
1629               charset_alt = charset_latin_jisx0201;                     \
1630             ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);          \
1631           }                                                             \
1632         else                                                            \
1633           {                                                             \
1634             if (charset_alt == charset_jisx0208                         \
1635                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)          \
1636               charset_alt = charset_jisx0208_1978;                      \
1637             ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);      \
1638           }                                                             \
1639       }                                                                 \
1640     else                                                                \
1641       {                                                                 \
1642         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1643           {                                                             \
1644             *dst++ = charset & 0x7f;                                    \
1645             *dst++ = c1 & 0x7f;                                         \
1646             if (c2)                                                     \
1647               *dst++ = c2 & 0x7f;                                       \
1648           }                                                             \
1649         else                                                            \
1650           {                                                             \
1651             *dst++ = charset;                                           \
1652             *dst++ = c1;                                                \
1653             if (c2)                                                     \
1654               *dst++ = c2;                                              \
1655           }                                                             \
1656       }                                                                 \
1657     if (! COMPOSING_P (coding->composing))                              \
1658       coding->consumed_char++;                                          \
1659   } while (0)
1660
1661 /* Produce designation and invocation codes at a place pointed by DST
1662    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1663    Return new DST.

        The work is done in two steps:

        1. If CHARSET is not the current designation of any of the four
           graphic registers, designate it (ENCODE_DESIGNATION) to the
           register it requests in CODING, or to register 0 when it
           requests none.

        2. If the register holding CHARSET is invoked to neither graphic
           plane 0 nor graphic plane 1, emit a shift for it: shift-in
           for register 0, shift-out for register 1, and for registers
           2 and 3 a single-shift when CODING_FLAG_ISO_SINGLE_SHIFT is
           set, a locking-shift otherwise.

        No check is made here that the bytes fit at DST; the caller must
        have reserved enough room.  */
1664
1665 unsigned char *
1666 encode_invocation_designation (charset, coding, dst)
1667      int charset;
1668      struct coding_system *coding;
1669      unsigned char *dst;
1670 {
1671   int reg;                      /* graphic register number */
1672
1673   /* At first, check designations.  */
1674   for (reg = 0; reg < 4; reg++)
1675     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1676       break;
1677
1678   if (reg >= 4)
1679     {
1680       /* CHARSET is not yet designated to any graphic registers.  */
1681       /* At first check the requested designation.  */
1682       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1683       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1684         /* Since CHARSET requests no special designation, designate it
1685            to graphic register 0.  */
1686         reg = 0;
1687
1688       ENCODE_DESIGNATION (charset, reg, coding);
1689     }
1690
1691   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1692       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1693     {
1694       /* Since the graphic register REG is not invoked to any graphic
1695          planes, invoke it to graphic plane 0.  */
1696       switch (reg)
1697         {
1698         case 0:                 /* graphic register 0 */
1699           ENCODE_SHIFT_IN;
1700           break;
1701
1702         case 1:                 /* graphic register 1 */
1703           ENCODE_SHIFT_OUT;
1704           break;
1705
1706         case 2:                 /* graphic register 2 */
1707           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1708             ENCODE_SINGLE_SHIFT_2;
1709           else
1710             ENCODE_LOCKING_SHIFT_2;
1711           break;
1712
1713         case 3:                 /* graphic register 3 */
1714           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1715             ENCODE_SINGLE_SHIFT_3;
1716           else
1717             ENCODE_LOCKING_SHIFT_3;
1718           break;
1719         }
1720     }
1721   return dst;
1722 }
1723
1724 /* The following three macros produce codes for indicating composition:
        ESC '0' starts a composition without rules, ESC '2' starts a
        composition with rules, and ESC '1' ends a composition.  Each
        expands to a bare comma expression that writes through the
        variable `dst' of the enclosing code.  */
1725 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1726 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1727 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1728
1729 /* The following three macros produce codes for indicating direction
1730    of text.

        ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the control sequence
        introducer through the variable `dst' of the enclosing code: the
        two bytes ESC '[' when CODING_FLAG_ISO_SEVEN_BITS is set in
        coding->flags, the single byte ISO_CODE_CSI otherwise.

        coding->flags is a set of flag bits, so the 7-bit flag has to be
        tested with `&'.  Comparing the whole word for equality would
        select the 7-bit form only when no other flag at all is set.  */
1731 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1732   do {                                                  \
1733     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1734       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1735     else                                                \
1736       *dst++ = ISO_CODE_CSI;                            \
1737   } while (0)
1738
1739 #define ENCODE_DIRECTION_R2L    \
1740   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1741
1742 #define ENCODE_DIRECTION_L2R    \
1743   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1744
1745 /* Produce codes for designation and invocation to reset the graphic
1746    planes and registers to initial state.

        Uses the variables `coding' and `dst' of the enclosing code.
        First, if graphic plane 0 does not currently invoke register 0, a
        shift-in is emitted.  Then, for each of the four registers whose
        initial designation is non-negative and differs from its current
        designation, the initial charset is designated again.  Registers
        with a negative initial designation are left untouched.  */
1747 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1748   do {                                                                      \
1749     int reg;                                                                \
1750     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1751       ENCODE_SHIFT_IN;                                                      \
1752     for (reg = 0; reg < 4; reg++)                                           \
1753       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1754           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1755               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1756         ENCODE_DESIGNATION                                                  \
1757           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1758   } while (0)
1759
1760 /* Produce designation sequences of charsets in the line started from
1761    SRC to a place pointed by *DSTP, and update DSTP.
1762
1763    If the current block ends before any end-of-line, we may fail to
1764    find all the necessary designations.

        The text from SRC up to SRC_END is scanned character by character
        until a '\n' is met or all four graphic registers have been
        claimed.  When TABLE is non-nil each character is first passed
        through translate_char and, if that yields a non-negative
        character, the charset of the result is used instead.  For every
        charset that requests a graphic register in CODING, the first
        charset seen for that register is remembered.

        Afterwards each remembered charset that is not already the
        current designation of its register is designated, and *DSTP is
        advanced past the bytes written.  *DSTP is left unchanged when no
        charset on the line requests a register.

        NOTE(review): the scan only checks that the first byte of a
        character is below SRC_END; whether the helper macros read the
        following bytes of a character cut off by SRC_END cannot be told
        from here -- confirm.  */
1765
1766 void
1767 encode_designation_at_bol (coding, table, src, src_end, dstp)
1768      struct coding_system *coding;
1769      Lisp_Object table;
1770      unsigned char *src, *src_end, **dstp;
1771 {
1772   int charset, c, found = 0, reg;
1773   /* Table of charsets to be designated to each graphic register.  */
1774   int r[4];
1775   unsigned char *dst = *dstp;
1776
1777   for (reg = 0; reg < 4; reg++)
1778     r[reg] = -1;
1779
1780   while (src < src_end && *src != '\n' && found < 4)
1781     {
1782       int bytes = BYTES_BY_CHAR_HEAD (*src);
1783
1784       if (NILP (table))
1785         charset = CHARSET_AT (src);
1786       else
1787         {
1788           int c_alt;
1789           unsigned char c1, c2;
1790
1791           SPLIT_STRING(src, bytes, charset, c1, c2);
1792           if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
1793             charset = CHAR_CHARSET (c_alt);
1794         }
1795
1796       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
           /* Only the first charset found for a register is kept.  */
1797       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1798         {
1799           found++;
1800           r[reg] = charset;
1801         }
1802
1803       src += bytes;
1804     }
1805
1806   if (found)
1807     {
1808       for (reg = 0; reg < 4; reg++)
1809         if (r[reg] >= 0
1810             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1811           ENCODE_DESIGNATION (r[reg], reg, coding);
1812       *dstp = dst;
1813     }
1814 }
1815
1816 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the SRC_BYTES bytes of Emacs internal format text at
        SOURCE into the ISO2022 variant described by CODING, storing the
        result at DESTINATION.

        When DST_BYTES is non-zero it is the size of the destination and
        encoding stops once fewer than 19 bytes of it remain.  When
        DST_BYTES is zero the loop instead requires DST to stay at least
        19 bytes behind SRC (presumably source and destination share one
        buffer in that case -- confirm against the callers).

        Return value:
          CODING_FINISH_NORMAL            all of the source was encoded;
          CODING_FINISH_INSUFFICIENT_SRC  the source ended inside a
                                          multi-byte sequence; SRC is
                                          rewound to the start of it;
          CODING_FINISH_INSUFFICIENT_DST  the loop stopped with source
                                          text left and no other result.

        On return coding->consumed is the number of source bytes used,
        coding->consumed_char the number of characters counted while
        encoding, and coding->produced and coding->produced_char are both
        the number of bytes written.  coding->fake_multibyte is reset to
        0 on entry and may be set to 1 by the single-shift macros.

        When CODING_MODE_LAST_BLOCK is set in coding->mode, the graphic
        planes and registers are reset at the end, a pending composition
        is closed, and after CODING_FINISH_INSUFFICIENT_SRC the remaining
        source bytes are copied as they are while room is left.  */
1817
1818 int
1819 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1820      struct coding_system *coding;
1821      unsigned char *source, *destination;
1822      int src_bytes, dst_bytes;
1823 {
1824   unsigned char *src = source;
1825   unsigned char *src_end = source + src_bytes;
1826   unsigned char *dst = destination;
1827   unsigned char *dst_end = destination + dst_bytes;
1828   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1829      from DST_END to assure overflow checking is necessary only at the
1830      head of loop.  */
1831   unsigned char *adjusted_dst_end = dst_end - 19;
1832   Lisp_Object translation_table
1833       = coding->translation_table_for_encode;
1834   int result = CODING_FINISH_NORMAL;
1835
       /* Fall back to the standard table when character translation is
          enabled and CODING has no table of its own.  */
1836   if (!NILP (Venable_character_translation) && NILP (translation_table))
1837     translation_table = Vstandard_translation_table_for_encode;
1838
1839   coding->consumed_char = 0;
1840   coding->fake_multibyte = 0;
1841   while (src < src_end && (dst_bytes
1842                            ? (dst < adjusted_dst_end)
1843                            : (dst < src - 19)))
1844     {
1845       /* SRC_BASE remembers the start position in source in each loop.
1846          The loop will be exited when there's not enough source text
1847          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1848          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1849          reset to SRC_BASE before exiting.  */
1850       unsigned char *src_base = src;
1851       int charset, c1, c2, c3, c4;
1852
1853       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1854           && CODING_SPEC_ISO_BOL (coding))
1855         {
1856           /* We have to produce designation sequences if any now.  */
1857           encode_designation_at_bol (coding, translation_table,
1858                                      src, src_end, &dst);
1859           CODING_SPEC_ISO_BOL (coding) = 0;
1860         }
1861
1862       c1 = *src++;
1863       /* If we are seeing a component of a composite character, we are
1864          seeing a leading-code encoded irregularly for composition, or
1865          a composition rule if composing with rule.  We must set C1 to
1866          a normal leading-code or an ASCII code.  If we are not seeing
1867          a composite character, we must reset composition,
1868          designation, and invocation states.  */
1869       if (COMPOSING_P (coding->composing))
1870         {
1871           if (c1 < 0xA0)
1872             {
1873               /* We are not in a composite character any longer.  */
1874               coding->composing = COMPOSING_NO;
1875               ENCODE_RESET_PLANE_AND_REGISTER;
1876               ENCODE_COMPOSITION_END;
1877             }
1878           else
1879             {
1880               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1881                 {
                       /* C1 is a composition rule: emit it with the high
                          bit cleared and expect a component next.  */
1882                   *dst++ = c1 & 0x7F;
1883                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1884                   continue;
1885                 }
1886               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1887                 coding->composing = COMPOSING_WITH_RULE_RULE;
1888               if (c1 == 0xA0)
1889                 {
1890                   /* This is an ASCII component.  */
1891                   ONE_MORE_BYTE (c1);
1892                   c1 &= 0x7F;
1893                 }
1894               else
1895                 /* This is a leading-code of non ASCII component.  */
1896                 c1 -= 0x20;
1897             }
1898         }
1899
1900       /* Now encode one character.  C1 is a control character, an
1901          ASCII character, or a leading-code of multi-byte character.  */
1902       switch (emacs_code_class[c1])
1903         {
1904         case EMACS_ascii_code:
1905           c2 = 0;
1906           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1907           break;
1908
1909         case EMACS_control_code:
1910           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1911             ENCODE_RESET_PLANE_AND_REGISTER;
1912           *dst++ = c1;
1913           coding->consumed_char++;
1914           break;
1915
1916         case EMACS_carriage_return_code:
1917           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
1918             {
1919               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1920                 ENCODE_RESET_PLANE_AND_REGISTER;
1921               *dst++ = c1;
1922               coding->consumed_char++;
1923               break;
1924             }
1925           /* fall through to treat '\r' as '\n' ...  */
1926
1927         case EMACS_linefeed_code:
1928           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1929             ENCODE_RESET_PLANE_AND_REGISTER;
               /* Forget the current designations without emitting
                  anything: they are simply overwritten by the initial
                  ones.  */
1930           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1931             bcopy (coding->spec.iso2022.initial_designation,
1932                    coding->spec.iso2022.current_designation,
1933                    sizeof coding->spec.iso2022.initial_designation);
               /* Emit the end-of-line in the format selected by
                  coding->eol_type: LF, CR LF, or CR alone.  */
1934           if (coding->eol_type == CODING_EOL_LF
1935               || coding->eol_type == CODING_EOL_UNDECIDED)
1936             *dst++ = ISO_CODE_LF;
1937           else if (coding->eol_type == CODING_EOL_CRLF)
1938             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1939           else
1940             *dst++ = ISO_CODE_CR;
1941           CODING_SPEC_ISO_BOL (coding) = 1;
1942           coding->consumed_char++;
1943           break;
1944
1945         case EMACS_leading_code_2:
1946           ONE_MORE_BYTE (c2);
1947           c3 = 0;
1948           if (c2 < 0xA0)
1949             {
1950               /* invalid sequence */
                   /* Copy the leading code as is and re-read the byte
                      that followed it on the next iteration.  */
1951               *dst++ = c1;
1952               src--;
1953               coding->consumed_char++;
1954             }
1955           else
1956             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1957           break;
1958
1959         case EMACS_leading_code_3:
1960           TWO_MORE_BYTES (c2, c3);
1961           c4 = 0;
1962           if (c2 < 0xA0 || c3 < 0xA0)
1963             {
1964               /* invalid sequence */
1965               *dst++ = c1;
1966               src -= 2;
1967               coding->consumed_char++;
1968             }
1969           else if (c1 < LEADING_CODE_PRIVATE_11)
1970             ENCODE_ISO_CHARACTER (c1, c2, c3);
1971           else
                 /* C1 is at or above LEADING_CODE_PRIVATE_11: the charset
                    is taken from C2 and C3 is the only position code.  */
1972             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1973           break;
1974
1975         case EMACS_leading_code_4:
1976           THREE_MORE_BYTES (c2, c3, c4);
1977           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1978             {
1979               /* invalid sequence */
1980               *dst++ = c1;
1981               src -= 3;
1982               coding->consumed_char++;
1983             }
1984           else
1985             ENCODE_ISO_CHARACTER (c2, c3, c4);
1986           break;
1987
1988         case EMACS_leading_code_composition:
1989           ONE_MORE_BYTE (c2);
1990           if (c2 < 0xA0)
1991             {
1992               /* invalid sequence */
1993               *dst++ = c1;
1994               src--;
1995               coding->consumed_char++;
1996             }
1997           else if (c2 == 0xFF)
1998             {
                   /* 0xFF after the composition leading code starts a
                      composition with rules.  */
1999               ENCODE_RESET_PLANE_AND_REGISTER;
2000               coding->composing = COMPOSING_WITH_RULE_HEAD;
2001               ENCODE_COMPOSITION_WITH_RULE_START;
2002               coding->consumed_char++;
2003             }
2004           else
2005             {
2006               ENCODE_RESET_PLANE_AND_REGISTER;
2007               /* Rewind one byte because it is a character code of
2008                  composition elements.  */
2009               src--;
2010               coding->composing = COMPOSING_NO_RULE_HEAD;
2011               ENCODE_COMPOSITION_NO_RULE_START;
2012               coding->consumed_char++;
2013             }
2014           break;
2015
2016         case EMACS_invalid_code:
2017           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2018             ENCODE_RESET_PLANE_AND_REGISTER;
2019           *dst++ = c1;
2020           coding->consumed_char++;
2021           break;
2022         }
2023       continue;
         /* Reached only by a jump (per the comment on SRC_BASE above, from
            the ..._MORE_BYTE(S) macros) when the source runs out in the
            middle of a multi-byte sequence.  */
2024     label_end_of_loop:
2025       result = CODING_FINISH_INSUFFICIENT_SRC;
2026       src = src_base;
2027       break;
2028     }
2029
2030   if (src < src_end && result == CODING_FINISH_NORMAL)
2031     result = CODING_FINISH_INSUFFICIENT_DST;
2032
2033   /* If this is the last block of the text to be encoded, we must
2034      reset graphic planes and registers to the initial state, and
2035      flush out the carryover if any.  */
       /* NOTE(review): the reset and composition-end codes below are
          written without comparing DST against DST_END; only the final
          byte-copy loop checks it.  Whether the 19-byte margin kept by
          the main loop always covers them is not evident here --
          confirm.  */
2036   if (coding->mode & CODING_MODE_LAST_BLOCK)
2037     {
2038       ENCODE_RESET_PLANE_AND_REGISTER;
2039       if (COMPOSING_P (coding->composing))
2040         ENCODE_COMPOSITION_END;
2041       if (result == CODING_FINISH_INSUFFICIENT_SRC)
2042         {
2043           while (src < src_end && dst < dst_end)
2044             *dst++ = *src++;
2045         }
2046     }
2047   coding->consumed = src - source;
2048   coding->produced = coding->produced_char = dst - destination;
2049   return result;
2050 }
2051
2052 \f
2053 /*** 4. SJIS and BIG5 handlers ***/
2054
2055 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2056    quite widely.  So, for the moment, Emacs supports them in the bare
2057    C code.  But, in the future, they may be supported only by CCL.  */
2058
2059 /* SJIS is a coding system encoding three character sets: ASCII, right
2060    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2061    as is.  A character of charset katakana-jisx0201 is encoded by
2062    "position-code + 0x80".  A character of charset japanese-jisx0208
2063    is encoded in two bytes, but the two position-codes are divided and
2064    shifted so that they fit in the ranges below.
2065
2066    --- CODE RANGE of SJIS ---
2067    (character set)      (range)
2068    ASCII                0x00 .. 0x7F
2069    KATAKANA-JISX0201    0xA0 .. 0xDF
2070    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2071             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2072    -------------------------------
2073
2074 */
2075
2076 /* BIG5 is a coding system encoding two character sets: ASCII and
2077    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2078    character set and is encoded in two bytes.
2079
2080    --- CODE RANGE of BIG5 ---
2081    (character set)      (range)
2082    ASCII                0x00 .. 0x7F
2083    Big5 (1st byte)      0xA1 .. 0xFE
2084         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2085    --------------------------
2086
2087    Since the number of characters in Big5 is larger than the maximum
2088    number of characters in an Emacs charset (96x96), it can't be handled as one
2089    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2090    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2091    contains frequently used characters and the latter contains less
2092    frequently used characters.  */
2093
2094 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2095    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2096    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2097    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2098
/* Count of Big5 code points that share one 1st byte: the 2nd byte
   covers 0x40..0x7E (0x7F - 0x40 values) and 0xA1..0xFE (0xFF - 0xA1
   values).  */
#define BIG5_SAME_ROW ((0x7F - 0x40) + (0xFF - 0xA1))
2101
/* Decode the BIG5 byte pair B1, B2.  CHARSET is set to
   `charset_big5_1' when B1 is below 0xC9 and to `charset_big5_2'
   otherwise; C1 and C2 receive the two position-codes, each offset
   from 0x21.  B1 and B2 may be arbitrary expressions and are
   evaluated exactly once; CHARSET, C1 and C2 must be lvalues.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    int big5_b1 = (b1), big5_b2 = (b2);                                 \
    unsigned int temp                                                   \
      = ((big5_b1 - 0xA1) * BIG5_SAME_ROW                               \
         + big5_b2 - (big5_b2 < 0x7F ? 0x40 : 0x62));                   \
    if (big5_b1 < 0xC9)                                                 \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)
2116
/* Encode the Emacs position-codes C1, C2 of CHARSET
   (`charset_big5_1' or `charset_big5_2') into the BIG5 byte pair B1,
   B2.  C1 and C2 must already have the MSB cleared (0x21 based).
   CHARSET, C1 and C2 may be arbitrary expressions; B1 and B2 must be
   lvalues.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Skip the unused 0x7F..0xA0 gap in the 2nd byte.  */              \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2126
/* Emit the decoded form of the character of CHARSET whose
   position-codes are C1 and C2.  When the enclosing function's
   `translation_table' is non-nil and `translate_char' yields a
   non-negative character, that character is split back into a
   charset and position-codes (overwriting C1 and C2) before the
   result is handed to the DECODE_CHARACTER_XXX macro matching the
   charset's dimension.  C1 and C2 must be lvalues.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt = -1, charset_alt = (charset);                            \
    if (!NILP (translation_table))                                      \
      c_alt = translate_char (translation_table,                        \
                              -1, (charset), c1, c2);                   \
    if (c_alt >= 0)                                                     \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
  } while (0)
2141
/* Encode the character of CHARSET with position-codes C1 and C2 and
   store the result through `dst', advancing it by one to three bytes.
   The macro relies on `translation_table', `sjis_p', `dst' and
   `coding' of the enclosing function.  When `translation_table' is
   non-nil and `translate_char' yields a non-negative character, that
   character replaces the original one (C1 and C2 are overwritten, so
   they must be lvalues).  Characters which the target coding cannot
   express are stored as their charset byte followed by the
   position-codes, and `coding->fake_multibyte' is set.
   `coding->consumed_char' is incremented once per use.

   The definition deliberately ends without a semicolon so that the
   macro behaves as a single statement when the caller supplies one.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)             \
  do {                                                          \
    int c_alt, charset_alt;                                     \
    if (!NILP (translation_table)                               \
        && ((c_alt = translate_char (translation_table, -1,     \
                                     (charset), c1, c2))        \
            >= 0))                                              \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
    else                                                        \
      charset_alt = (charset);                                  \
    if (charset_alt == charset_ascii)                           \
      *dst++ = c1;                                              \
    else if (CHARSET_DIMENSION (charset_alt) == 1)              \
      {                                                         \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1;                                          \
        else if (sjis_p && charset_alt == charset_latin_jisx0201) \
          *dst++ = c1 & 0x7F;                                   \
        else                                                    \
          {                                                     \
            *dst++ = charset_alt, *dst++ = c1;                  \
            coding->fake_multibyte = 1;                         \
          }                                                     \
      }                                                         \
    else                                                        \
      {                                                         \
        c1 &= 0x7F, c2 &= 0x7F;                                 \
        if (sjis_p && (charset_alt == charset_jisx0208          \
                       || charset_alt == charset_jisx0208_1978))\
          {                                                     \
            unsigned char s1, s2;                               \
                                                                \
            ENCODE_SJIS (c1, c2, s1, s2);                       \
            *dst++ = s1, *dst++ = s2;                           \
            coding->fake_multibyte = 1;                         \
          }                                                     \
        else if (!sjis_p                                        \
                 && (charset_alt == charset_big5_1              \
                     || charset_alt == charset_big5_2))         \
          {                                                     \
            unsigned char b1, b2;                               \
                                                                \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);          \
            *dst++ = b1, *dst++ = b2;                           \
          }                                                     \
        else                                                    \
          {                                                     \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;     \
            coding->fake_multibyte = 1;                         \
          }                                                     \
      }                                                         \
    coding->consumed_char++;                                    \
  } while (0)
2195
2196 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2197    Check if a text is encoded in SJIS.  If it is, return
2198    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2199
2200 int
2201 detect_coding_sjis (src, src_end)
2202      unsigned char *src, *src_end;
2203 {
2204   unsigned char c;
2205
2206   while (src < src_end)
2207     {
2208       c = *src++;
2209       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2210         {
2211           if (src < src_end && *src++ < 0x40)
2212             return 0;
2213         }
2214     }
2215   return CODING_CATEGORY_MASK_SJIS;
2216 }
2217
2218 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2219    Check if a text is encoded in BIG5.  If it is, return
2220    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2221
2222 int
2223 detect_coding_big5 (src, src_end)
2224      unsigned char *src, *src_end;
2225 {
2226   unsigned char c;
2227
2228   while (src < src_end)
2229     {
2230       c = *src++;
2231       if (c >= 0xA1)
2232         {
2233           if (src >= src_end)
2234             break;
2235           c = *src++;
2236           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2237             return 0;
2238         }
2239     }
2240   return CODING_CATEGORY_MASK_BIG5;
2241 }
2242
2243 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2244    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2245
2246 int
2247 decode_coding_sjis_big5 (coding, source, destination,
2248                          src_bytes, dst_bytes, sjis_p)
2249      struct coding_system *coding;
2250      unsigned char *source, *destination;
2251      int src_bytes, dst_bytes;
2252      int sjis_p;
2253 {
2254   unsigned char *src = source;
2255   unsigned char *src_end = source + src_bytes;
2256   unsigned char *dst = destination;
2257   unsigned char *dst_end = destination + dst_bytes;
2258   /* Since the maximum bytes produced by each loop is 4, we subtract 3
2259      from DST_END to assure overflow checking is necessary only at the
2260      head of loop.  */
2261   unsigned char *adjusted_dst_end = dst_end - 3;
2262   Lisp_Object translation_table
2263       = coding->translation_table_for_decode;
2264   int result = CODING_FINISH_NORMAL;
2265
2266   if (!NILP (Venable_character_translation) && NILP (translation_table))
2267     translation_table = Vstandard_translation_table_for_decode;
2268
2269   coding->produced_char = 0;
2270   coding->fake_multibyte = 0;
2271   while (src < src_end && (dst_bytes
2272                            ? (dst < adjusted_dst_end)
2273                            : (dst < src - 3)))
2274     {
2275       /* SRC_BASE remembers the start position in source in each loop.
2276          The loop will be exited when there's not enough source text
2277          to analyze two-byte character (within macro ONE_MORE_BYTE).
2278          In that case, SRC is reset to SRC_BASE before exiting.  */
2279       unsigned char *src_base = src;
2280       unsigned char c1 = *src++, c2, c3, c4;
2281
2282       if (c1 < 0x20)
2283         {
2284           if (c1 == '\r')
2285             {
2286               if (coding->eol_type == CODING_EOL_CRLF)
2287                 {
2288                   ONE_MORE_BYTE (c2);
2289                   if (c2 == '\n')
2290                     *dst++ = c2;
2291                   else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2292                     {
2293                       result = CODING_FINISH_INCONSISTENT_EOL;
2294                       goto label_end_of_loop_2;
2295                     }
2296                   else
2297                     /* To process C2 again, SRC is subtracted by 1.  */
2298                     *dst++ = c1, src--;
2299                 }
2300               else if (coding->eol_type == CODING_EOL_CR)
2301                 *dst++ = '\n';
2302               else
2303                 *dst++ = c1;
2304             }
2305           else if (c1 == '\n'
2306                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2307                    && (coding->eol_type == CODING_EOL_CR
2308                        || coding->eol_type == CODING_EOL_CRLF))
2309             {
2310               result = CODING_FINISH_INCONSISTENT_EOL;
2311               goto label_end_of_loop_2;
2312             }
2313           else
2314             *dst++ = c1;
2315           coding->produced_char++;
2316         }
2317       else if (c1 < 0x80)
2318         {
2319           c2 = 0;               /* avoid warning */
2320           DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2321         }
2322       else
2323         {
2324           if (sjis_p)
2325             {
2326               if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
2327                 {
2328                   /* SJIS -> JISX0208 */
2329                   ONE_MORE_BYTE (c2);
2330                   if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
2331                     {
2332                       DECODE_SJIS (c1, c2, c3, c4);
2333                       DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2334                     }
2335                   else
2336                     goto label_invalid_code_2;
2337                 }
2338               else if (c1 < 0xE0)
2339                 /* SJIS -> JISX0201-Kana */
2340                 {
2341                   c2 = 0;       /* avoid warning */
2342                   DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2343                                               /* dummy */ c2);
2344                 }
2345               else
2346                 goto label_invalid_code_1;
2347             }
2348           else
2349             {
2350               /* BIG5 -> Big5 */
2351               if (c1 >= 0xA1 && c1 <= 0xFE)
2352                 {
2353                   ONE_MORE_BYTE (c2);
2354                   if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2355                     {
2356                       int charset;
2357
2358                       DECODE_BIG5 (c1, c2, charset, c3, c4);
2359                       DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2360                     }
2361                   else
2362                     goto label_invalid_code_2;
2363                 }
2364               else
2365                 goto label_invalid_code_1;
2366             }
2367         }
2368       continue;
2369
2370     label_invalid_code_1:
2371       *dst++ = c1;
2372       coding->produced_char++;
2373       coding->fake_multibyte = 1;
2374       continue;
2375
2376     label_invalid_code_2:
2377       *dst++ = c1; *dst++= c2;
2378       coding->produced_char += 2;
2379       coding->fake_multibyte = 1;
2380       continue;
2381
2382     label_end_of_loop:
2383       result = CODING_FINISH_INSUFFICIENT_SRC;
2384     label_end_of_loop_2:
2385       src = src_base;
2386       break;
2387     }
2388
2389   if (src < src_end)
2390     {
2391       if (result == CODING_FINISH_NORMAL)
2392         result = CODING_FINISH_INSUFFICIENT_DST;
2393       else if (result != CODING_FINISH_INCONSISTENT_EOL
2394                && coding->mode & CODING_MODE_LAST_BLOCK)
2395         {
2396           src_bytes = src_end - src;
2397           if (dst_bytes && (dst_end - dst < src_bytes))
2398             src_bytes = dst_end - dst;
2399           bcopy (dst, src, src_bytes);
2400           src += src_bytes;
2401           dst += src_bytes;
2402           coding->fake_multibyte = 1;
2403         }
2404     }
2405
2406   coding->consumed = coding->consumed_char = src - source;
2407   coding->produced = dst - destination;
2408   return result;
2409 }
2410
2411 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2412    This function can encode `charset_ascii', `charset_katakana_jisx0201',
2413    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
2414    sure that all these charsets are registered as official charset
2415    (i.e. do not have extended leading-codes).  Characters of other
2416    charsets are produced without any encoding.  If SJIS_P is 1, encode
2417    SJIS text, else encode BIG5 text.  */
2418
2419 int
2420 encode_coding_sjis_big5 (coding, source, destination,
2421                          src_bytes, dst_bytes, sjis_p)
2422      struct coding_system *coding;
2423      unsigned char *source, *destination;
2424      int src_bytes, dst_bytes;
2425      int sjis_p;
2426 {
2427   unsigned char *src = source;
2428   unsigned char *src_end = source + src_bytes;
2429   unsigned char *dst = destination;
2430   unsigned char *dst_end = destination + dst_bytes;
2431   /* Since the maximum bytes produced by each loop is 2, we subtract 1
2432      from DST_END to assure overflow checking is necessary only at the
2433      head of loop.  */
2434   unsigned char *adjusted_dst_end = dst_end - 1;
2435   Lisp_Object translation_table
2436       = coding->translation_table_for_encode;
2437   int result = CODING_FINISH_NORMAL;
2438
2439   if (!NILP (Venable_character_translation) && NILP (translation_table))
2440     translation_table = Vstandard_translation_table_for_encode;
2441
2442   coding->consumed_char = 0;
2443   coding->fake_multibyte = 0;
2444   while (src < src_end && (dst_bytes
2445                            ? (dst < adjusted_dst_end)
2446                            : (dst < src - 1)))
2447     {
2448       /* SRC_BASE remembers the start position in source in each loop.
2449          The loop will be exited when there's not enough source text
2450          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2451          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
2452          before exiting.  */
2453       unsigned char *src_base = src;
2454       unsigned char c1 = *src++, c2, c3, c4;
2455
2456       if (coding->composing)
2457         {
2458           if (c1 == 0xA0)
2459             {
2460               ONE_MORE_BYTE (c1);
2461               c1 &= 0x7F;
2462             }
2463           else if (c1 >= 0xA0)
2464             c1 -= 0x20;
2465           else
2466             coding->composing = 0;
2467         }
2468
2469       switch (emacs_code_class[c1])
2470         {
2471         case EMACS_ascii_code:
2472           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2473           break;
2474
2475         case EMACS_control_code:
2476           *dst++ = c1;
2477           coding->consumed_char++;
2478           break;
2479
2480         case EMACS_carriage_return_code:
2481           if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2482             {
2483               *dst++ = c1;
2484               coding->consumed_char++;
2485               break;
2486             }
2487           /* fall down to treat '\r' as '\n' ...  */
2488
2489         case EMACS_linefeed_code:
2490           if (coding->eol_type == CODING_EOL_LF
2491               || coding->eol_type == CODING_EOL_UNDECIDED)
2492             *dst++ = '\n';
2493           else if (coding->eol_type == CODING_EOL_CRLF)
2494             *dst++ = '\r', *dst++ = '\n';
2495           else
2496             *dst++ = '\r';
2497           coding->consumed_char++;
2498           break;
2499
2500         case EMACS_leading_code_2:
2501           ONE_MORE_BYTE (c2);
2502           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2503           break;
2504
2505         case EMACS_leading_code_3:
2506           TWO_MORE_BYTES (c2, c3);
2507           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2508           break;
2509
2510         case EMACS_leading_code_4:
2511           THREE_MORE_BYTES (c2, c3, c4);
2512           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2513           break;
2514
2515         case EMACS_leading_code_composition:
2516           coding->composing = 1;
2517           break;
2518
2519         default:                /* i.e. case EMACS_invalid_code: */
2520           *dst++ = c1;
2521           coding->consumed_char++;
2522         }
2523       continue;
2524
2525     label_end_of_loop:
2526       result = CODING_FINISH_INSUFFICIENT_SRC;
2527       src = src_base;
2528       break;
2529     }
2530
2531   if (result == CODING_FINISH_NORMAL
2532       && src < src_end)
2533     result = CODING_FINISH_INSUFFICIENT_DST;
2534   coding->consumed = src - source;
2535   coding->produced = coding->produced_char = dst - destination;
2536   return result;
2537 }
2538
2539 \f
2540 /*** 5. CCL handlers ***/
2541
2542 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2543    Check if a text is encoded in a coding system of which
2544    encoder/decoder are written in CCL program.  If it is, return
2545    CODING_CATEGORY_MASK_CCL, else return 0.  */
2546
2547 int
2548 detect_coding_ccl (src, src_end)
2549      unsigned char *src, *src_end;
2550 {
2551   unsigned char *valid;
2552
2553   /* No coding system is assigned to coding-category-ccl.  */
2554   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2555     return 0;
2556
2557   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2558   while (src < src_end)
2559     {
2560       if (! valid[*src]) return 0;
2561       src++;
2562     }
2563   return CODING_CATEGORY_MASK_CCL;
2564 }
2565
2566 \f
2567 /*** 6. End-of-line handlers ***/
2568
2569 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2570    This function is called only when `coding->eol_type' is
2571    CODING_EOL_CRLF or CODING_EOL_CR.  */
2572
2573 int
2574 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2575      struct coding_system *coding;
2576      unsigned char *source, *destination;
2577      int src_bytes, dst_bytes;
2578 {
2579   unsigned char *src = source;
2580   unsigned char *src_end = source + src_bytes;
2581   unsigned char *dst = destination;
2582   unsigned char *dst_end = destination + dst_bytes;
2583   unsigned char c;
2584   int result = CODING_FINISH_NORMAL;
2585
2586   coding->fake_multibyte = 0;
2587
2588   if (src_bytes <= 0)
2589     {
2590       coding->produced = coding->produced_char = 0;
2591       coding->consumed = coding->consumed_char = 0;
2592       return result;
2593     }
2594
2595   switch (coding->eol_type)
2596     {
2597     case CODING_EOL_CRLF:
2598       {
2599         /* Since the maximum bytes produced by each loop is 2, we
2600            subtract 1 from DST_END to assure overflow checking is
2601            necessary only at the head of loop.  */
2602         unsigned char *adjusted_dst_end = dst_end - 1;
2603
2604         while (src < src_end && (dst_bytes
2605                                  ? (dst < adjusted_dst_end)
2606                                  : (dst < src - 1)))
2607           {
2608             unsigned char *src_base = src;
2609
2610             c = *src++;
2611             if (c == '\r')
2612               {
2613                 ONE_MORE_BYTE (c);
2614                 if (c == '\n')
2615                   *dst++ = c;
2616                 else
2617                   {
2618                     if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2619                       {
2620                         result = CODING_FINISH_INCONSISTENT_EOL;
2621                         goto label_end_of_loop_2;
2622                       }
2623                     src--;
2624                     *dst++ = '\r';
2625                     if (BASE_LEADING_CODE_P (c))
2626                       coding->fake_multibyte = 1;
2627                   }
2628               }
2629             else if (c == '\n'
2630                      && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2631               {
2632                 result = CODING_FINISH_INCONSISTENT_EOL;
2633                 goto label_end_of_loop_2;
2634               }
2635             else
2636               {
2637                 *dst++ = c;
2638                 if (BASE_LEADING_CODE_P (c))
2639                   coding->fake_multibyte = 1;
2640               }
2641             continue;
2642
2643           label_end_of_loop:
2644             result = CODING_FINISH_INSUFFICIENT_SRC;
2645           label_end_of_loop_2:
2646             src = src_base;
2647             break;
2648           }
2649         if (src < src_end)
2650           {
2651             if (result == CODING_FINISH_NORMAL)
2652               result = CODING_FINISH_INSUFFICIENT_DST;
2653             else if (result != CODING_FINISH_INCONSISTENT_EOL
2654                      && coding->mode & CODING_MODE_LAST_BLOCK)
2655               {
2656                 /* This is the last block of the text to be decoded.
2657                    We flush out all remaining codes.  */
2658                 src_bytes = src_end - src;
2659                 if (dst_bytes && (dst_end - dst < src_bytes))
2660                   src_bytes = dst_end - dst;
2661                 bcopy (src, dst, src_bytes);
2662                 dst += src_bytes;
2663                 src += src_bytes;
2664               }
2665           }
2666       }
2667       break;
2668
2669     case CODING_EOL_CR:
2670       if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2671         {
2672           while (src < src_end)
2673             {
2674               if ((c = *src++) == '\n')
2675                 break;
2676               if (BASE_LEADING_CODE_P (c))
2677                 coding->fake_multibyte = 1;
2678             }
2679           if (*--src == '\n')
2680             {
2681               src_bytes = src - source;
2682               result = CODING_FINISH_INCONSISTENT_EOL;
2683             }
2684         }
2685       if (dst_bytes && src_bytes > dst_bytes)
2686         {
2687           result = CODING_FINISH_INSUFFICIENT_DST;
2688           src_bytes = dst_bytes;
2689         }
2690       if (dst_bytes)
2691         bcopy (source, destination, src_bytes);
2692       else
2693         safe_bcopy (source, destination, src_bytes);
2694       src = source + src_bytes;
2695       while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
2696       break;
2697
2698     default:                    /* i.e. case: CODING_EOL_LF */
2699       if (dst_bytes && src_bytes > dst_bytes)
2700         {
2701           result = CODING_FINISH_INSUFFICIENT_DST;
2702           src_bytes = dst_bytes;
2703         }
2704       if (dst_bytes)
2705         bcopy (source, destination, src_bytes);
2706       else
2707         safe_bcopy (source, destination, src_bytes);
2708       src += src_bytes;
2709       dst += src_bytes;
2710       coding->fake_multibyte = 1;
2711       break;
2712     }
2713
2714   coding->consumed = coding->consumed_char = src - source;
2715   coding->produced = coding->produced_char = dst - destination;
2716   return result;
2717 }
2718
2719 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2720    format of end-of-line according to `coding->eol_type'.  If
2721    `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2722    '\r' in source text also means end-of-line.  */
2723
2724 int
2725 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2726      struct coding_system *coding;
2727      unsigned char *source, *destination;
2728      int src_bytes, dst_bytes;
2729 {
2730   unsigned char *src = source;
2731   unsigned char *dst = destination;
2732   int result = CODING_FINISH_NORMAL;
2733
2734   coding->fake_multibyte = 0;
2735
2736   if (coding->eol_type == CODING_EOL_CRLF)
2737     {
2738       unsigned char c;
2739       unsigned char *src_end = source + src_bytes;
2740       unsigned char *dst_end = destination + dst_bytes;
2741       /* Since the maximum bytes produced by each loop is 2, we
2742          subtract 1 from DST_END to assure overflow checking is
2743          necessary only at the head of loop.  */
2744       unsigned char *adjusted_dst_end = dst_end - 1;
2745
2746       while (src < src_end && (dst_bytes
2747                                ? (dst < adjusted_dst_end)
2748                                : (dst < src - 1)))
2749         {
2750           c = *src++;
2751           if (c == '\n'
2752               || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2753             *dst++ = '\r', *dst++ = '\n';
2754           else
2755             {
2756               *dst++ = c;
2757               if (BASE_LEADING_CODE_P (c))
2758                 coding->fake_multibyte = 1;
2759             }
2760         }
2761       if (src < src_end)
2762         result = CODING_FINISH_INSUFFICIENT_DST;
2763     }
2764   else
2765     {
2766       unsigned char c;
2767
2768       if (dst_bytes && src_bytes > dst_bytes)
2769         {
2770           src_bytes = dst_bytes;
2771           result = CODING_FINISH_INSUFFICIENT_DST;
2772         }
2773       if (dst_bytes)
2774         bcopy (source, destination, src_bytes);
2775       else
2776         safe_bcopy (source, destination, src_bytes);
2777       dst_bytes = src_bytes;
2778       if (coding->eol_type == CODING_EOL_CR)
2779         {
2780           while (src_bytes--)
2781             {
2782               if ((c = *dst++) == '\n')
2783                 dst[-1] = '\r';
2784               else if (BASE_LEADING_CODE_P (c))
2785                 coding->fake_multibyte = 1;
2786             }
2787         }
2788       else
2789         {
2790           if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2791             {
2792               while (src_bytes--)
2793                 if (*dst++ == '\r') dst[-1] = '\n';
2794             }
2795           coding->fake_multibyte = 1;
2796         }
2797       src = source + dst_bytes;
2798       dst = destination + dst_bytes;
2799     }
2800
2801   coding->consumed = coding->consumed_char = src - source;
2802   coding->produced = coding->produced_char = dst - destination;
2803   return result;
2804 }
2805
2806 \f
2807 /*** 7. C library functions ***/
2808
2809 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2810    has a property `coding-system'.  The value of this property is a
2811    vector of length 5 (called the coding-vector).  Among elements of
2812    this vector, the first (element[0]) and the fifth (element[4])
2813    carry important information for decoding/encoding.  Before
2814    decoding/encoding, this information should be set in fields of a
2815    structure of type `coding_system'.
2816
2817    A value of property `coding-system' can be a symbol of another
2818    subsidiary coding-system.  In that case, Emacs gets coding-vector
2819    from that symbol.
2820
2821    `element[0]' contains information to be set in `coding->type'.  The
2822    value and its meaning is as follows:
2823
2824    0 -- coding_type_emacs_mule
2825    1 -- coding_type_sjis
2826    2 -- coding_type_iso2022
2827    3 -- coding_type_big5
2828    4 -- coding_type_ccl encoder/decoder written in CCL
2829    nil -- coding_type_no_conversion
2830    t -- coding_type_undecided (automatic conversion on decoding,
2831                                no-conversion on encoding)
2832
2833    `element[4]' contains information to be set in `coding->flags' and
2834    `coding->spec'.  The meaning varies by `coding->type'.
2835
2836    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2837    of length 32 (of which the first 13 sub-elements are used now).
2838    Meanings of these sub-elements are:
2839
2840    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2841         If the value is an integer of valid charset, the charset is
2842         assumed to be designated to graphic register N initially.
2843
2844         If the value is negative, it is the negation of a charset which
2845         reserves graphic register N, which means that the charset is
2846         not designated initially but should be designated to graphic
2847         register N just before encoding a character in that charset.
2848
2849         If the value is nil, graphic register N is never used on
2850         encoding.
2851
2852    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2853         Each value takes t or nil.  See the section ISO2022 of
2854         `coding.h' for more information.
2855
2856    If `coding->type' is `coding_type_big5', element[4] is t to denote
2857    BIG5-ETen or nil to denote BIG5-HKU.
2858
2859    If `coding->type' is `coding_type_ccl', element[4] is a cons of the CCL decoder and encoder; for any other type, element[4] is ignored.
2860
2861    Emacs Lisp's coding system also carries information about format of
2862    end-of-line in a value of property `eol-type'.  If the value is
2863    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2864    means CODING_EOL_CR.  If it is not integer, it should be a vector
2865    of subsidiary coding systems of which property `eol-type' has one
2866    of above values.
2867
2868 */
2869
2870 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2871    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2872    is setup so that no conversion is necessary and return -1, else
2873    return 0.  */
2874
2875 int
2876 setup_coding_system (coding_system, coding)
2877      Lisp_Object coding_system;
2878      struct coding_system *coding;
2879 {
2880   Lisp_Object coding_spec, coding_type, eol_type, plist;
2881   Lisp_Object val;
2882   int i;
2883
2884   /* Initialize some fields required for all kinds of coding systems.  */
2885   coding->symbol = coding_system;
2886   coding->common_flags = 0;
2887   coding->mode = 0;
2888   coding->heading_ascii = -1;
2889   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2890
2891   if (NILP (coding_system))
2892     goto label_invalid_coding_system;
2893
2894   coding_spec = Fget (coding_system, Qcoding_system);
2895
2896   if (!VECTORP (coding_spec)
2897       || XVECTOR (coding_spec)->size != 5
2898       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2899     goto label_invalid_coding_system;
2900
2901   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2902   if (VECTORP (eol_type))
2903     {
2904       coding->eol_type = CODING_EOL_UNDECIDED;
2905       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2906     }
2907   else if (XFASTINT (eol_type) == 1)
2908     {
2909       coding->eol_type = CODING_EOL_CRLF;
2910       coding->common_flags
2911         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2912     }
2913   else if (XFASTINT (eol_type) == 2)
2914     {
2915       coding->eol_type = CODING_EOL_CR;
2916       coding->common_flags
2917         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2918     }
2919   else
2920     coding->eol_type = CODING_EOL_LF;
2921
2922   coding_type = XVECTOR (coding_spec)->contents[0];
2923   /* Try short cut.  */
2924   if (SYMBOLP (coding_type))
2925     {
2926       if (EQ (coding_type, Qt))
2927         {
2928           coding->type = coding_type_undecided;
2929           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2930         }
2931       else
2932         coding->type = coding_type_no_conversion;
2933       return 0;
2934     }
2935
2936   /* Initialize remaining fields.  */
2937   coding->composing = 0;
2938   coding->composed_chars = 0;
2939
2940   /* Get values of coding system properties:
2941      `post-read-conversion', `pre-write-conversion',
2942      `translation-table-for-decode', `translation-table-for-encode'.  */
2943   plist = XVECTOR (coding_spec)->contents[3];
2944   coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2945   coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2946   val = Fplist_get (plist, Qtranslation_table_for_decode);
2947   if (SYMBOLP (val))
2948     val = Fget (val, Qtranslation_table_for_decode);
2949   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2950   val = Fplist_get (plist, Qtranslation_table_for_encode);
2951   if (SYMBOLP (val))
2952     val = Fget (val, Qtranslation_table_for_encode);
2953   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2954   val = Fplist_get (plist, Qcoding_category);
2955   if (!NILP (val))
2956     {
2957       val = Fget (val, Qcoding_category_index);
2958       if (INTEGERP (val))
2959         coding->category_idx = XINT (val);
2960       else
2961         goto label_invalid_coding_system;
2962     }
2963   else
2964     goto label_invalid_coding_system;
2965
2966   val = Fplist_get (plist, Qsafe_charsets);
2967   if (EQ (val, Qt))
2968     {
2969       for (i = 0; i <= MAX_CHARSET; i++)
2970         coding->safe_charsets[i] = 1;
2971     }
2972   else
2973     {
2974       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2975       while (CONSP (val))
2976         {
2977           if ((i = get_charset_id (XCAR (val))) >= 0)
2978             coding->safe_charsets[i] = 1;
2979           val = XCDR (val);
2980         }
2981     }
2982
2983   switch (XFASTINT (coding_type))
2984     {
2985     case 0:
2986       coding->type = coding_type_emacs_mule;
2987       if (!NILP (coding->post_read_conversion))
2988         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2989       if (!NILP (coding->pre_write_conversion))
2990         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2991       break;
2992
2993     case 1:
2994       coding->type = coding_type_sjis;
2995       coding->common_flags
2996         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2997       break;
2998
2999     case 2:
3000       coding->type = coding_type_iso2022;
3001       coding->common_flags
3002         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3003       {
3004         Lisp_Object val, temp;
3005         Lisp_Object *flags;
3006         int i, charset, reg_bits = 0;
3007
3008         val = XVECTOR (coding_spec)->contents[4];
3009
3010         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3011           goto label_invalid_coding_system;
3012
3013         flags = XVECTOR (val)->contents;
3014         coding->flags
3015           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3016              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3017              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3018              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3019              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3020              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3021              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3022              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3023              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3024              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3025              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3026              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3027              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3028              );
3029
3030         /* Invoke graphic register 0 to plane 0.  */
3031         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3032         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3033         CODING_SPEC_ISO_INVOCATION (coding, 1)
3034           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3035         /* Not single shifting at first.  */
3036         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3037         /* Beginning of buffer should also be regarded as bol. */
3038         CODING_SPEC_ISO_BOL (coding) = 1;
3039
3040         for (charset = 0; charset <= MAX_CHARSET; charset++)
3041           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3042         val = Vcharset_revision_alist;
3043         while (CONSP (val))
3044           {
3045             charset = get_charset_id (Fcar_safe (XCAR (val)));
3046             if (charset >= 0
3047                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3048                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3049               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3050             val = XCDR (val);
3051           }
3052
3053         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3054            FLAGS[REG] can be one of below:
3055                 integer CHARSET: CHARSET occupies register I,
3056                 t: designate nothing to REG initially, but can be used
3057                   by any charsets,
3058                 list of integer, nil, or t: designate the first
3059                   element (if integer) to REG initially, the remaining
3060                   elements (if integer) is designated to REG on request,
3061                   if an element is t, REG can be used by any charsets,
3062                 nil: REG is never used.  */
3063         for (charset = 0; charset <= MAX_CHARSET; charset++)
3064           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3065             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3066         for (i = 0; i < 4; i++)
3067           {
3068             if (INTEGERP (flags[i])
3069                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3070                 || (charset = get_charset_id (flags[i])) >= 0)
3071               {
3072                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3073                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3074               }
3075             else if (EQ (flags[i], Qt))
3076               {
3077                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3078                 reg_bits |= 1 << i;
3079                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3080               }
3081             else if (CONSP (flags[i]))
3082               {
3083                 Lisp_Object tail;
3084                 tail = flags[i];
3085
3086                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3087                 if (INTEGERP (XCAR (tail))
3088                     && (charset = XINT (XCAR (tail)),
3089                         CHARSET_VALID_P (charset))
3090                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3091                   {
3092                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3093                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3094                   }
3095                 else
3096                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3097                 tail = XCDR (tail);
3098                 while (CONSP (tail))
3099                   {
3100                     if (INTEGERP (XCAR (tail))
3101                         && (charset = XINT (XCAR (tail)),
3102                             CHARSET_VALID_P (charset))
3103                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3104                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3105                         = i;
3106                     else if (EQ (XCAR (tail), Qt))
3107                       reg_bits |= 1 << i;
3108                     tail = XCDR (tail);
3109                   }
3110               }
3111             else
3112               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3113
3114             CODING_SPEC_ISO_DESIGNATION (coding, i)
3115               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3116           }
3117
3118         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3119           {
3120             /* REG 1 can be used only by locking shift in 7-bit env.  */
3121             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3122               reg_bits &= ~2;
3123             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3124               /* Without any shifting, only REG 0 and 1 can be used.  */
3125               reg_bits &= 3;
3126           }
3127
3128         if (reg_bits)
3129           for (charset = 0; charset <= MAX_CHARSET; charset++)
3130             {
3131               if (CHARSET_VALID_P (charset))
3132                 {
3133                   /* There exist some default graphic registers to be
3134                      used CHARSET.  */
3135
3136                   /* We had better avoid designating a charset of
3137                      CHARS96 to REG 0 as far as possible.  */
3138                   if (CHARSET_CHARS (charset) == 96)
3139                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3140                       = (reg_bits & 2
3141                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3142                   else
3143                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3144                       = (reg_bits & 1
3145                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3146                 }
3147             }
3148       }
3149       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3150       coding->spec.iso2022.last_invalid_designation_register = -1;
3151       break;
3152
3153     case 3:
3154       coding->type = coding_type_big5;
3155       coding->common_flags
3156         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3157       coding->flags
3158         = (NILP (XVECTOR (coding_spec)->contents[4])
3159            ? CODING_FLAG_BIG5_HKU
3160            : CODING_FLAG_BIG5_ETEN);
3161       break;
3162
3163     case 4:
3164       coding->type = coding_type_ccl;
3165       coding->common_flags
3166         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3167       {
3168         val = XVECTOR (coding_spec)->contents[4];
3169         if (! CONSP (val)
3170             || setup_ccl_program (&(coding->spec.ccl.decoder),
3171                                   XCAR (val)) < 0
3172             || setup_ccl_program (&(coding->spec.ccl.encoder),
3173                                   XCDR (val)) < 0)
3174           goto label_invalid_coding_system;
3175
3176         bzero (coding->spec.ccl.valid_codes, 256);
3177         val = Fplist_get (plist, Qvalid_codes);
3178         if (CONSP (val))
3179           {
3180             Lisp_Object this;
3181
3182             for (; CONSP (val); val = XCDR (val))
3183               {
3184                 this = XCAR (val);
3185                 if (INTEGERP (this)
3186                     && XINT (this) >= 0 && XINT (this) < 256)
3187                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3188                 else if (CONSP (this)
3189                          && INTEGERP (XCAR (this))
3190                          && INTEGERP (XCDR (this)))
3191                   {
3192                     int start = XINT (XCAR (this));
3193                     int end = XINT (XCDR (this));
3194
3195                     if (start >= 0 && start <= end && end < 256)
3196                       while (start <= end)
3197                         coding->spec.ccl.valid_codes[start++] = 1;
3198                   }
3199               }
3200           }
3201       }
3202       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3203       break;
3204
3205     case 5:
3206       coding->type = coding_type_raw_text;
3207       break;
3208
3209     default:
3210       goto label_invalid_coding_system;
3211     }
3212   return 0;
3213
3214  label_invalid_coding_system:
3215   coding->type = coding_type_no_conversion;
3216   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3217   coding->common_flags = 0;
3218   coding->eol_type = CODING_EOL_LF;
3219   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3220   return -1;
3221 }
3222
3223 /* Setup raw-text or one of its subsidiaries in the structure
3224    coding_system CODING according to the already setup value eol_type
3225    in CODING.  CODING should be setup for some coding system in
3226    advance.  */
3227
3228 void
3229 setup_raw_text_coding_system (coding)
3230      struct coding_system *coding;
3231 {
3232   if (coding->type != coding_type_raw_text)
3233     {
3234       coding->symbol = Qraw_text;
3235       coding->type = coding_type_raw_text;
3236       if (coding->eol_type != CODING_EOL_UNDECIDED)
3237         {
3238           Lisp_Object subsidiaries;
3239           subsidiaries = Fget (Qraw_text, Qeol_type);
3240
3241           if (VECTORP (subsidiaries)
3242               && XVECTOR (subsidiaries)->size == 3)
3243             coding->symbol
3244               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3245         }
3246       setup_coding_system (coding->symbol, coding);
3247     }
3248   return;
3249 }
3250
3251 /* Emacs has a mechanism to automatically detect a coding system if it
3252    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3253    it's impossible to distinguish some coding systems accurately
3254    because they use the same range of codes.  So, at first, coding
3255    systems are categorized into the following categories:
3256
3257    o coding-category-emacs-mule
3258
3259         The category for a coding system which has the same code range
3260         as Emacs' internal format.  Assigned the coding-system (Lisp
3261         symbol) `emacs-mule' by default.
3262
3263    o coding-category-sjis
3264
3265         The category for a coding system which has the same code range
3266         as SJIS.  Assigned the coding-system (Lisp
3267         symbol) `japanese-shift-jis' by default.
3268
3269    o coding-category-iso-7
3270
3271         The category for a coding system which has the same code range
3272         as ISO2022 of 7-bit environment.  This doesn't use any locking
3273         shift and single shift functions.  This can encode/decode all
3274         charsets.  Assigned the coding-system (Lisp symbol)
3275         `iso-2022-7bit' by default.
3276
3277    o coding-category-iso-7-tight
3278
3279         Same as coding-category-iso-7 except that this can
3280         encode/decode only the specified charsets.
3281
3282    o coding-category-iso-8-1
3283
3284         The category for a coding system which has the same code range
3285         as ISO2022 of 8-bit environment and graphic plane 1 used only
3286         for DIMENSION1 charset.  This doesn't use any locking shift
3287         and single shift functions.  Assigned the coding-system (Lisp
3288         symbol) `iso-latin-1' by default.
3289
3290    o coding-category-iso-8-2
3291
3292         The category for a coding system which has the same code range
3293         as ISO2022 of 8-bit environment and graphic plane 1 used only
3294         for DIMENSION2 charset.  This doesn't use any locking shift
3295         and single shift functions.  Assigned the coding-system (Lisp
3296         symbol) `japanese-iso-8bit' by default.
3297
3298    o coding-category-iso-7-else
3299
3300         The category for a coding system which has the same code range
3301         as ISO2022 of 7-bit environment but uses locking shift or
3302         single shift functions.  Assigned the coding-system (Lisp
3303         symbol) `iso-2022-7bit-lock' by default.
3304
3305    o coding-category-iso-8-else
3306
3307         The category for a coding system which has the same code range
3308         as ISO2022 of 8-bit environment but uses locking shift or
3309         single shift functions.  Assigned the coding-system (Lisp
3310         symbol) `iso-2022-8bit-ss2' by default.
3311
3312    o coding-category-big5
3313
3314         The category for a coding system which has the same code range
3315         as BIG5.  Assigned the coding-system (Lisp symbol)
3316         `cn-big5' by default.
3317
3318    o coding-category-ccl
3319
3320         The category for a coding system of which encoder/decoder is
3321         written in CCL programs.  The default value is nil, i.e., no
3322         coding system is assigned.
3323
3324    o coding-category-binary
3325
3326         The category for a coding system not categorized in any of the
3327         above.  Assigned the coding-system (Lisp symbol)
3328         `no-conversion' by default.
3329
3330    Each of them is a Lisp symbol and the value is an actual
3331    `coding-system's (this is also a Lisp symbol) assigned by a user.
3332    What Emacs does actually is to detect a category of coding system.
3333    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3334    decide only one possible category, it selects a category of the
3335    highest priority.  Priorities of categories are also specified by a
3336    user in a Lisp variable `coding-category-list'.
3337
3338 */
3339
3340 static
3341 int ascii_skip_code[256];
3342
3343 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3344    If it detects possible coding systems, return an integer in which
3345    appropriate flag bits are set.  Flag bits are defined by macros
3346    CODING_CATEGORY_MASK_XXX in `coding.h'.
3347
3348    How many ASCII characters are at the head is returned as *SKIP.  */
3349
3350 static int
3351 detect_coding_mask (source, src_bytes, priorities, skip)
3352      unsigned char *source;
3353      int src_bytes, *priorities, *skip;
3354 {
3355   register unsigned char c;
3356   unsigned char *src = source, *src_end = source + src_bytes;
3357   unsigned int mask;
3358   int i;
3359
3360   /* At first, skip all ASCII characters and control characters except
3361      for three ISO2022 specific control characters.  */
3362   ascii_skip_code[ISO_CODE_SO] = 0;
3363   ascii_skip_code[ISO_CODE_SI] = 0;
3364   ascii_skip_code[ISO_CODE_ESC] = 0;
3365
3366  label_loop_detect_coding:
3367   while (src < src_end && ascii_skip_code[*src]) src++;
3368   *skip = src - source;
3369
3370   if (src >= src_end)
3371     /* We found nothing other than ASCII.  There's nothing to do.  */
3372     return 0;
3373
3374   c = *src;
3375   /* The text seems to be encoded in some multilingual coding system.
3376      Now, try to find in which coding system the text is encoded.  */
3377   if (c < 0x80)
3378     {
3379       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3380       /* C is an ISO2022 specific control code of C0.  */
3381       mask = detect_coding_iso2022 (src, src_end);
3382       if (mask == 0)
3383         {
3384           /* No valid ISO2022 code follows C.  Try again.  */
3385           src++;
3386           if (c == ISO_CODE_ESC)
3387             ascii_skip_code[ISO_CODE_ESC] = 1;
3388           else
3389             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3390           goto label_loop_detect_coding;
3391         }
3392       if (priorities)
3393         goto label_return_highest_only;
3394     }
3395   else
3396     {
3397       int try;
3398
3399       if (c < 0xA0)
3400         {
3401           /* C is the first byte of SJIS character code,
3402              or a leading-code of Emacs' internal format (emacs-mule).  */
3403           try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3404
3405           /* Or, if C is a special latin extra code,
3406              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3407              or is an ISO2022 control-sequence-introducer (CSI),
3408              we should also consider the possibility of ISO2022 codings.  */
3409           if ((VECTORP (Vlatin_extra_code_table)
3410                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3411               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3412               || (c == ISO_CODE_CSI
3413                   && (src < src_end
3414                       && (*src == ']'
3415                           || ((*src == '0' || *src == '1' || *src == '2')
3416                               && src + 1 < src_end
3417                               && src[1] == ']')))))
3418             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3419                      | CODING_CATEGORY_MASK_ISO_8BIT);
3420         }
3421       else
3422         /* C is a character of ISO2022 in graphic plane right,
3423            or a SJIS's 1-byte character code (i.e. JISX0201),
3424            or the first byte of BIG5's 2-byte code.  */
3425         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3426                 | CODING_CATEGORY_MASK_ISO_8BIT
3427                 | CODING_CATEGORY_MASK_SJIS
3428                 | CODING_CATEGORY_MASK_BIG5);
3429
3430       /* Or, we may have to consider the possibility of CCL.  */
3431       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3432           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3433               ->spec.ccl.valid_codes)[c])
3434         try |= CODING_CATEGORY_MASK_CCL;
3435
3436       mask = 0;
3437       if (priorities)
3438         {
3439           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3440             {
3441               if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
3442                 mask = detect_coding_iso2022 (src, src_end);
3443               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3444                 mask = detect_coding_sjis (src, src_end);
3445               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3446                 mask = detect_coding_big5 (src, src_end);
3447               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3448                 mask = detect_coding_emacs_mule (src, src_end);
3449               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3450                 mask = detect_coding_ccl (src, src_end);
3451               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3452                 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3453               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3454                 mask = CODING_CATEGORY_MASK_BINARY;
3455               if (mask)
3456                 goto label_return_highest_only;
3457             }
3458           return CODING_CATEGORY_MASK_RAW_TEXT;
3459         }
3460       if (try & CODING_CATEGORY_MASK_ISO)
3461         mask |= detect_coding_iso2022 (src, src_end);
3462       if (try & CODING_CATEGORY_MASK_SJIS)
3463         mask |= detect_coding_sjis (src, src_end);
3464       if (try & CODING_CATEGORY_MASK_BIG5)
3465         mask |= detect_coding_big5 (src, src_end);
3466       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3467         mask |= detect_coding_emacs_mule (src, src_end);
3468       if (try & CODING_CATEGORY_MASK_CCL)
3469         mask |= detect_coding_ccl (src, src_end);
3470     }
3471   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3472
3473  label_return_highest_only:
3474   for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3475     {
3476       if (mask & priorities[i])
3477         return priorities[i];
3478     }
3479   return CODING_CATEGORY_MASK_RAW_TEXT;
3480 }
3481
3482 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3483    The information of the detected coding system is set in CODING.  */
3484
3485 void
3486 detect_coding (coding, src, src_bytes)
3487      struct coding_system *coding;
3488      unsigned char *src;
3489      int src_bytes;
3490 {
3491   unsigned int idx;
3492   int skip, mask, i;
3493   Lisp_Object val;
3494
3495   val = Vcoding_category_list;
3496   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3497   coding->heading_ascii = skip;
3498
3499   if (!mask) return;
3500
3501   /* We found a single coding system of the highest priority in MASK.  */
3502   idx = 0;
3503   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3504   if (! mask)
3505     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3506
3507   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3508
3509   if (coding->eol_type != CODING_EOL_UNDECIDED)
3510     {
3511       Lisp_Object tmp;
3512
3513       tmp = Fget (val, Qeol_type);
3514       if (VECTORP (tmp))
3515         val = XVECTOR (tmp)->contents[coding->eol_type];
3516     }
3517   setup_coding_system (val, coding);
3518   /* Set this again because setup_coding_system reset this member.  */
3519   coding->heading_ascii = skip;
3520 }
3521
3522 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3523    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3524    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3525
3526    How many non-eol characters are at the head is returned as *SKIP.  */
3527
3528 #define MAX_EOL_CHECK_COUNT 3
3529
3530 static int
3531 detect_eol_type (source, src_bytes, skip)
3532      unsigned char *source;
3533      int src_bytes, *skip;
3534 {
3535   unsigned char *src = source, *src_end = src + src_bytes;
3536   unsigned char c;
3537   int total = 0;                /* How many end-of-lines are found so far.  */
3538   int eol_type = CODING_EOL_UNDECIDED;
3539   int this_eol_type;
3540
3541   *skip = 0;
3542
3543   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3544     {
3545       c = *src++;
3546       if (c == '\n' || c == '\r')
3547         {
3548           if (*skip == 0)
3549             *skip = src - 1 - source;
3550           total++;
3551           if (c == '\n')
3552             this_eol_type = CODING_EOL_LF;
3553           else if (src >= src_end || *src != '\n')
3554             this_eol_type = CODING_EOL_CR;
3555           else
3556             this_eol_type = CODING_EOL_CRLF, src++;
3557
3558           if (eol_type == CODING_EOL_UNDECIDED)
3559             /* This is the first end-of-line.  */
3560             eol_type = this_eol_type;
3561           else if (eol_type != this_eol_type)
3562             {
3563               /* The found type is different from what found before.  */
3564               eol_type = CODING_EOL_INCONSISTENT;
3565               break;
3566             }
3567         }
3568     }
3569
3570   if (*skip == 0)
3571     *skip = src_end - source;
3572   return eol_type;
3573 }
3574
3575 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3576    is encoded.  If it detects an appropriate format of end-of-line, it
3577    sets the information in *CODING.  */
3578
3579 void
3580 detect_eol (coding, src, src_bytes)
3581      struct coding_system *coding;
3582      unsigned char *src;
3583      int src_bytes;
3584 {
3585   Lisp_Object val;
3586   int skip;
3587   int eol_type = detect_eol_type (src, src_bytes, &skip);
3588
3589   if (coding->heading_ascii > skip)
3590     coding->heading_ascii = skip;
3591   else
3592     skip = coding->heading_ascii;
3593
3594   if (eol_type == CODING_EOL_UNDECIDED)
3595     return;
3596   if (eol_type == CODING_EOL_INCONSISTENT)
3597     {
3598 #if 0
3599       /* This code is suppressed until we find a better way to
3600          distinguish raw text file and binary file.  */
3601
3602       /* If we have already detected that the coding is raw-text, the
3603          coding should actually be no-conversion.  */
3604       if (coding->type == coding_type_raw_text)
3605         {
3606           setup_coding_system (Qno_conversion, coding);
3607           return;
3608         }
3609       /* Else, let's decode only text code anyway.  */
3610 #endif /* 0 */
3611       eol_type = CODING_EOL_LF;
3612     }
3613
3614   val = Fget (coding->symbol, Qeol_type);
3615   if (VECTORP (val) && XVECTOR (val)->size == 3)
3616     {
3617       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3618       coding->heading_ascii = skip;
3619     }
3620 }
3621
/* Extra room (bytes) added to every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the byte length of a text may grow when decoded
   with CODING (a pointer to struct coding_system): 3 for ISO2022, 2
   for SJIS and BIG5, 1 for raw-text, the decoder's buf_magnification
   for CCL, and 2 for anything else.  The argument is parenthesized so
   that any pointer expression may be passed.  */
#define DECODING_BUFFER_MAG(coding)                                     \
  ((coding)->type == coding_type_iso2022                                \
   ? 3                                                                  \
   : (((coding)->type == coding_type_sjis                               \
       || (coding)->type == coding_type_big5)                           \
      ? 2                                                               \
      : ((coding)->type == coding_type_raw_text                         \
         ? 1                                                            \
         : ((coding)->type == coding_type_ccl                           \
            ? (coding)->spec.ccl.decoder.buf_magnification              \
            : 2))))
3634
3635 /* Return maximum size (bytes) of a buffer enough for decoding
3636    SRC_BYTES of text encoded in CODING.  */
3637
3638 int
3639 decoding_buffer_size (coding, src_bytes)
3640      struct coding_system *coding;
3641      int src_bytes;
3642 {
3643   return (src_bytes * DECODING_BUFFER_MAG (coding)
3644           + CONVERSION_BUFFER_EXTRA_ROOM);
3645 }
3646
3647 /* Return maximum size (bytes) of a buffer enough for encoding
3648    SRC_BYTES of text to CODING.  */
3649
3650 int
3651 encoding_buffer_size (coding, src_bytes)
3652      struct coding_system *coding;
3653      int src_bytes;
3654 {
3655   int magnification;
3656
3657   if (coding->type == coding_type_ccl)
3658     magnification = coding->spec.ccl.encoder.buf_magnification;
3659   else
3660     magnification = 3;
3661
3662   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3663 }
3664
/* Smallest capacity the growth computation in get_conversion_buffer
   starts from when no usable capacity has been recorded yet.  */
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Scratch buffer shared by encoding and decoding, and its current
   capacity in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is
   already large enough; otherwise it is replaced by a new one whose
   capacity is the current capacity doubled until it reaches SIZE.
   The previous contents are NOT carried over to the new buffer.

   NOTE(review): the result is whatever xmalloc returned; xmalloc
   presumably never returns NULL (it reports memory exhaustion
   itself), in which case callers never see NULL -- confirm.
   NOTE(review): the doubling is done in `int' and is not guarded
   against overflow for SIZE above half the range of `int'.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Start from the current capacity, or from the minimum when no
         positive capacity is recorded: doubling a zero capacity would
         never reach SIZE and the loop below would not terminate.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size)
        real_size *= 2;

      /* Allocate the new buffer before releasing the old one, so the
         globals keep describing the old buffer until the allocation
         call has returned.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3693
3694 int
3695 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3696      struct coding_system *coding;
3697      unsigned char *source, *destination;
3698      int src_bytes, dst_bytes, encodep;
3699 {
3700   struct ccl_program *ccl
3701     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3702   int result;
3703
3704   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3705
3706   coding->produced = ccl_driver (ccl, source, destination,
3707                                  src_bytes, dst_bytes, &(coding->consumed));
3708   coding->produced_char
3709     = (encodep
3710        ? coding->produced
3711        : multibyte_chars_in_text (destination, coding->produced));
3712   coding->consumed_char
3713     = multibyte_chars_in_text (source, coding->consumed);
3714
3715   switch (ccl->status)
3716     {
3717     case CCL_STAT_SUSPEND_BY_SRC:
3718       result = CODING_FINISH_INSUFFICIENT_SRC;
3719       break;
3720     case CCL_STAT_SUSPEND_BY_DST:
3721       result = CODING_FINISH_INSUFFICIENT_DST;
3722       break;
3723     case CCL_STAT_QUIT:
3724     case CCL_STAT_INVALID_CMD:
3725       result = CODING_FINISH_INTERRUPT;
3726       break;
3727     default:
3728       result = CODING_FINISH_NORMAL;
3729       break;
3730     }
3731   return result;
3732 }
3733
3734 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
3735    decoding, it may detect coding system and format of end-of-line if
3736    those are not yet decided.
3737
3738    This function does not make full use of DESTINATION buffer.  For
3739    instance, if coding->type is coding_type_iso2022, it uses only
3740    (DST_BYTES - 7) bytes of DESTINATION buffer.  In the case that
3741    DST_BYTES is decided by the function decoding_buffer_size, it
3742    contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3743    So, this function can decode the full SOURCE.  But, in the other
3744    case, if you want to avoid carry over, you must supply at least 7
3745    bytes more area in DESTINATION buffer than expected maximum bytes
3746    that will be produced by this function.  */
3747
3748 int
3749 decode_coding (coding, source, destination, src_bytes, dst_bytes)
3750      struct coding_system *coding;
3751      unsigned char *source, *destination;
3752      int src_bytes, dst_bytes;
3753 {
3754   int result;
3755
3756   if (src_bytes <= 0
3757       && coding->type != coding_type_ccl
3758       && ! (coding->mode & CODING_MODE_LAST_BLOCK
3759             && CODING_REQUIRE_FLUSHING (coding)))
3760     {
3761       coding->produced = coding->produced_char = 0;
3762       coding->consumed = coding->consumed_char = 0;
3763       coding->fake_multibyte = 0;
3764       return CODING_FINISH_NORMAL;
3765     }
3766
3767   if (coding->type == coding_type_undecided)
3768     detect_coding (coding, source, src_bytes);
3769
3770   if (coding->eol_type == CODING_EOL_UNDECIDED)
3771     detect_eol (coding, source, src_bytes);
3772
3773   switch (coding->type)
3774     {
3775     case coding_type_emacs_mule:
3776     case coding_type_undecided:
3777     case coding_type_raw_text:
3778       if (coding->eol_type == CODING_EOL_LF
3779           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3780         goto label_no_conversion;
3781       result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
3782       break;
3783
3784     case coding_type_sjis:
3785       result = decode_coding_sjis_big5 (coding, source, destination,
3786                                         src_bytes, dst_bytes, 1);
3787       break;
3788
3789     case coding_type_iso2022:
3790       result = decode_coding_iso2022 (coding, source, destination,
3791                                       src_bytes, dst_bytes);
3792       break;
3793
3794     case coding_type_big5:
3795       result = decode_coding_sjis_big5 (coding, source, destination,
3796                                         src_bytes, dst_bytes, 0);
3797       break;
3798
3799     case coding_type_ccl:
3800       result = ccl_coding_driver (coding, source, destination,
3801                                   src_bytes, dst_bytes, 0);
3802       break;
3803
3804     default:                    /* i.e. case coding_type_no_conversion: */
3805     label_no_conversion:
3806       if (dst_bytes && src_bytes > dst_bytes)
3807         {
3808           coding->produced = dst_bytes;
3809           result = CODING_FINISH_INSUFFICIENT_DST;
3810         }
3811       else
3812         {
3813           coding->produced = src_bytes;
3814           result = CODING_FINISH_NORMAL;
3815         }
3816       if (dst_bytes)
3817         bcopy (source, destination, coding->produced);
3818       else
3819         safe_bcopy (source, destination, coding->produced);
3820       coding->fake_multibyte = 1;
3821       coding->consumed
3822         = coding->consumed_char = coding->produced_char = coding->produced;
3823       break;
3824     }
3825
3826   return result;
3827 }
3828
3829 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3830
3831    This function does not make full use of DESTINATION buffer.  For
3832    instance, if coding->type is coding_type_iso2022, it uses only
3833    (DST_BYTES - 20) bytes of DESTINATION buffer.  In the case that
3834    DST_BYTES is decided by the function encoding_buffer_size, it
3835    contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3836    So, this function can encode the full SOURCE.  But, in the other
3837    case, if you want to avoid carry over, you must supply at least 20
3838    bytes more area in DESTINATION buffer than expected maximum bytes
3839    that will be produced by this function.  */
3840
3841 int
3842 encode_coding (coding, source, destination, src_bytes, dst_bytes)
3843      struct coding_system *coding;
3844      unsigned char *source, *destination;
3845      int src_bytes, dst_bytes;
3846 {
3847   int result;
3848
3849   if (src_bytes <= 0
3850       && ! (coding->mode & CODING_MODE_LAST_BLOCK
3851             && CODING_REQUIRE_FLUSHING (coding)))
3852     {
3853       coding->produced = coding->produced_char = 0;
3854       coding->consumed = coding->consumed_char = 0;
3855       coding->fake_multibyte = 0;
3856       return CODING_FINISH_NORMAL;
3857     }
3858
3859   switch (coding->type)
3860     {
3861     case coding_type_emacs_mule:
3862     case coding_type_undecided:
3863     case coding_type_raw_text:
3864       if (coding->eol_type == CODING_EOL_LF
3865           ||  coding->eol_type == CODING_EOL_UNDECIDED)
3866         goto label_no_conversion;
3867       result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
3868       break;
3869
3870     case coding_type_sjis:
3871       result = encode_coding_sjis_big5 (coding, source, destination,
3872                                         src_bytes, dst_bytes, 1);
3873       break;
3874
3875     case coding_type_iso2022:
3876       result = encode_coding_iso2022 (coding, source, destination,
3877                                       src_bytes, dst_bytes);
3878       break;
3879
3880     case coding_type_big5:
3881       result = encode_coding_sjis_big5 (coding, source, destination,
3882                                         src_bytes, dst_bytes, 0);
3883       break;
3884
3885     case coding_type_ccl:
3886       result = ccl_coding_driver (coding, source, destination,
3887                                   src_bytes, dst_bytes, 1);
3888       break;
3889
3890     default:                    /* i.e. case coding_type_no_conversion: */
3891     label_no_conversion:
3892       if (dst_bytes && src_bytes > dst_bytes)
3893         {
3894           coding->produced = dst_bytes;
3895           result = CODING_FINISH_INSUFFICIENT_DST;
3896         }
3897       else
3898         {
3899           coding->produced = src_bytes;
3900           result = CODING_FINISH_NORMAL;
3901         }
3902       if (dst_bytes)
3903         bcopy (source, destination, coding->produced);
3904       else
3905         safe_bcopy (source, destination, coding->produced);
3906       if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3907         {
3908           unsigned char *p = destination, *pend = p + coding->produced;
3909           while (p < pend)
3910             if (*p++ == '\015') p[-1] = '\n';
3911         }
3912       coding->fake_multibyte = 1;
3913       coding->consumed
3914         = coding->consumed_char = coding->produced_char = coding->produced;
3915       break;
3916     }
3917
3918   return result;
3919 }
3920
3921 /* Scan text in the region between *BEG and *END (byte positions),
3922    skip characters which we don't have to decode by coding system
3923    CODING at the head and tail, then set *BEG and *END to the region
3924    of the text we actually have to convert.  The caller should move
3925    the gap out of the region in advance.
3926
3927    If STR is not NULL, *BEG and *END are indices into STR.  */
3928
3929 static void
3930 shrink_decoding_region (beg, end, coding, str)
3931      int *beg, *end;
3932      struct coding_system *coding;
3933      unsigned char *str;
3934 {
3935   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
3936   int eol_conversion;
3937   Lisp_Object translation_table;
3938
3939   if (coding->type == coding_type_ccl
3940       || coding->type == coding_type_undecided
3941       || !NILP (coding->post_read_conversion))
3942     {
3943       /* We can't skip any data.  */
3944       return;
3945     }
3946   else if (coding->type == coding_type_no_conversion)
3947     {
3948       /* We need no conversion, but don't have to skip any data here.
3949          Decoding routine handles them effectively anyway.  */
3950       return;
3951     }
3952
3953   translation_table = coding->translation_table_for_decode;
3954   if (NILP (translation_table) && !NILP (Venable_character_translation))
3955     translation_table = Vstandard_translation_table_for_decode;
3956   if (CHAR_TABLE_P (translation_table))
3957     {
3958       int i;
3959       for (i = 0; i < 128; i++)
3960         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3961           break;
3962       if (i < 128)
3963         /* Some ASCII character should be tranlsated.  We give up
3964            shrinking.  */
3965         return;
3966     }
3967
3968   eol_conversion = (coding->eol_type != CODING_EOL_LF);
3969
3970   if ((! eol_conversion) && (coding->heading_ascii >= 0))
3971     /* Detection routine has already found how much we can skip at the
3972        head.  */
3973     *beg += coding->heading_ascii;
3974
3975   if (str)
3976     {
3977       begp_orig = begp = str + *beg;
3978       endp_orig = endp = str + *end;
3979     }
3980   else
3981     {
3982       begp_orig = begp = BYTE_POS_ADDR (*beg);
3983       endp_orig = endp = begp + *end - *beg;
3984     }
3985
3986   switch (coding->type)
3987     {
3988     case coding_type_emacs_mule:
3989     case coding_type_raw_text:
3990       if (eol_conversion)
3991         {
3992           if (coding->heading_ascii < 0)
3993             while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
3994           while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
3995             endp--;
3996           /* Do not consider LF as ascii if preceded by CR, since that
3997              confuses eol decoding. */
3998           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3999             endp++;
4000         }
4001       else
4002         begp = endp;
4003       break;
4004
4005     case coding_type_sjis:
4006     case coding_type_big5:
4007       /* We can skip all ASCII characters at the head.  */
4008       if (coding->heading_ascii < 0)
4009         {
4010           if (eol_conversion)
4011             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4012           else
4013             while (begp < endp && *begp < 0x80) begp++;
4014         }
4015       /* We can skip all ASCII characters at the tail except for the
4016          second byte of SJIS or BIG5 code.  */
4017       if (eol_conversion)
4018         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4019       else
4020         while (begp < endp && endp[-1] < 0x80) endp--;
4021       /* Do not consider LF as ascii if preceded by CR, since that
4022          confuses eol decoding. */
4023       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4024         endp++;
4025       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4026         endp++;
4027       break;
4028
4029     default:            /* i.e. case coding_type_iso2022: */
4030       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4031         /* We can't skip any data.  */
4032         break;
4033       if (coding->heading_ascii < 0)
4034         {
4035           /* We can skip all ASCII characters at the head except for a
4036              few control codes.  */
4037           while (begp < endp && (c = *begp) < 0x80
4038                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4039                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4040                  && (!eol_conversion || c != ISO_CODE_LF))
4041             begp++;
4042         }
4043       switch (coding->category_idx)
4044         {
4045         case CODING_CATEGORY_IDX_ISO_8_1:
4046         case CODING_CATEGORY_IDX_ISO_8_2:
4047           /* We can skip all ASCII characters at the tail.  */
4048           if (eol_conversion)
4049             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4050           else
4051             while (begp < endp && endp[-1] < 0x80) endp--;
4052           /* Do not consider LF as ascii if preceded by CR, since that
4053              confuses eol decoding. */
4054           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4055             endp++;
4056           break;
4057
4058         case CODING_CATEGORY_IDX_ISO_7:
4059         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4060           {
4061             /* We can skip all charactes at the tail except for 8-bit
4062                codes and ESC and the following 2-byte at the tail.  */
4063             unsigned char *eight_bit = NULL;
4064
4065             if (eol_conversion)
4066               while (begp < endp
4067                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4068                 {
4069                   if (!eight_bit && c & 0x80) eight_bit = endp;
4070                   endp--;
4071                 }
4072             else
4073               while (begp < endp
4074                      && (c = endp[-1]) != ISO_CODE_ESC)
4075                 {
4076                   if (!eight_bit && c & 0x80) eight_bit = endp;
4077                   endp--;
4078                 }
4079             /* Do not consider LF as ascii if preceded by CR, since that
4080                confuses eol decoding. */
4081             if (begp < endp && endp < endp_orig
4082                 && endp[-1] == '\r' && endp[0] == '\n')
4083               endp++;
4084             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4085               {
4086                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4087                   /* This is an ASCII designation sequence.  We can
4088                      surely skip the tail.  But, if we have
4089                      encountered an 8-bit code, skip only the codes
4090                      after that.  */
4091                   endp = eight_bit ? eight_bit : endp + 2;
4092                 else
4093                   /* Hmmm, we can't skip the tail.  */
4094                   endp = endp_orig;
4095               }
4096             else if (eight_bit)
4097               endp = eight_bit;
4098           }
4099         }
4100     }
4101   *beg += begp - begp_orig;
4102   *end += endp - endp_orig;
4103   return;
4104 }
4105
4106 /* Like shrink_decoding_region but for encoding.  */
4107
4108 static void
4109 shrink_encoding_region (beg, end, coding, str)
4110      int *beg, *end;
4111      struct coding_system *coding;
4112      unsigned char *str;
4113 {
4114   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4115   int eol_conversion;
4116   Lisp_Object translation_table;
4117
4118   if (coding->type == coding_type_ccl)
4119     /* We can't skip any data.  */
4120     return;
4121   else if (coding->type == coding_type_no_conversion)
4122     {
4123       /* We need no conversion.  */
4124       *beg = *end;
4125       return;
4126     }
4127
4128   translation_table = coding->translation_table_for_encode;
4129   if (NILP (translation_table) && !NILP (Venable_character_translation))
4130     translation_table = Vstandard_translation_table_for_encode;
4131   if (CHAR_TABLE_P (translation_table))
4132     {
4133       int i;
4134       for (i = 0; i < 128; i++)
4135         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4136           break;
4137       if (i < 128)
4138         /* Some ASCII character should be tranlsated.  We give up
4139            shrinking.  */
4140         return;
4141     }
4142
4143   if (str)
4144     {
4145       begp_orig = begp = str + *beg;
4146       endp_orig = endp = str + *end;
4147     }
4148   else
4149     {
4150       begp_orig = begp = BYTE_POS_ADDR (*beg);
4151       endp_orig = endp = begp + *end - *beg;
4152     }
4153
4154   eol_conversion = (coding->eol_type == CODING_EOL_CR
4155                     || coding->eol_type == CODING_EOL_CRLF);
4156
4157   /* Here, we don't have to check coding->pre_write_conversion because
4158      the caller is expected to have handled it already.  */
4159   switch (coding->type)
4160     {
4161     case coding_type_undecided:
4162     case coding_type_emacs_mule:
4163     case coding_type_raw_text:
4164       if (eol_conversion)
4165         {
4166           while (begp < endp && *begp != '\n') begp++;
4167           while (begp < endp && endp[-1] != '\n') endp--;
4168         }
4169       else
4170         begp = endp;
4171       break;
4172
4173     case coding_type_iso2022:
4174       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4175         /* We can't skip any data.  */
4176         break;
4177       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4178         {
4179           unsigned char *bol = begp;
4180           while (begp < endp && *begp < 0x80)
4181             {
4182               begp++;
4183               if (begp[-1] == '\n')
4184                 bol = begp;
4185             }
4186           begp = bol;
4187           goto label_skip_tail;
4188         }
4189       /* fall down ... */
4190
4191     default:
4192       /* We can skip all ASCII characters at the head and tail.  */
4193       if (eol_conversion)
4194         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4195       else
4196         while (begp < endp && *begp < 0x80) begp++;
4197     label_skip_tail:
4198       if (eol_conversion)
4199         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4200       else
4201         while (begp < endp && *(endp - 1) < 0x80) endp--;
4202       break;
4203     }
4204
4205   *beg += begp - begp_orig;
4206   *end += endp - endp_orig;
4207   return;
4208 }
4209
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END with shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region (otherwise), unless
   the region is not longer than the threshold above.  BEG and END are
   evaluated more than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
4223
4224 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4225    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4226    coding system CODING, and return the status code of code conversion
4227    (currently, this value has no meaning).
4228
4229    How many characters (and bytes) are converted to how many
4230    characters (and bytes) are recorded in members of the structure
4231    CODING.
4232
4233    If REPLACE is nonzero, we do various things as if the original text
4234    is deleted and a new text is inserted.  See the comments in
4235    replace_range (insdel.c) to know what we are doing.  */
4236
4237 int
4238 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4239      int from, from_byte, to, to_byte, encodep, replace;
4240      struct coding_system *coding;
4241 {
4242   int len = to - from, len_byte = to_byte - from_byte;
4243   int require, inserted, inserted_byte;
4244   int head_skip, tail_skip, total_skip;
4245   Lisp_Object saved_coding_symbol;
4246   int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4247   int first = 1;
4248   int fake_multibyte = 0;
4249   unsigned char *src, *dst;
4250   Lisp_Object deletion;
4251   int orig_point = PT, orig_len = len;
4252   int prev_Z;
4253
4254   deletion = Qnil;
4255   saved_coding_symbol = Qnil;
4256
4257   if (from < PT && PT < to)
4258     {
4259       TEMP_SET_PT_BOTH (from, from_byte);
4260       orig_point = from;
4261     }
4262
4263   if (replace)
4264     {
4265       int saved_from = from;
4266
4267       prepare_to_modify_buffer (from, to, &from);
4268       if (saved_from != from)
4269         {
4270           to = from + len;
4271           if (multibyte)
4272             from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4273           else
4274             from_byte = from, to_byte = to;
4275           len_byte = to_byte - from_byte;
4276         }
4277     }
4278
4279   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4280     {
4281       /* We must detect encoding of text and eol format.  */
4282
4283       if (from < GPT && to > GPT)
4284         move_gap_both (from, from_byte);
4285       if (coding->type == coding_type_undecided)
4286         {
4287           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4288           if (coding->type == coding_type_undecided)
4289             /* It seems that the text contains only ASCII, but we
4290                should not left it undecided because the deeper
4291                decoding routine (decode_coding) tries to detect the
4292                encodings again in vain.  */
4293             coding->type = coding_type_emacs_mule;
4294         }
4295       if (coding->eol_type == CODING_EOL_UNDECIDED)
4296         {
4297           saved_coding_symbol = coding->symbol;
4298           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4299           if (coding->eol_type == CODING_EOL_UNDECIDED)
4300             coding->eol_type = CODING_EOL_LF;
4301           /* We had better recover the original eol format if we
4302              encounter an inconsitent eol format while decoding.  */
4303           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4304         }
4305     }
4306
4307   coding->consumed_char = len, coding->consumed = len_byte;
4308
4309   if (encodep
4310       ? ! CODING_REQUIRE_ENCODING (coding)
4311       : ! CODING_REQUIRE_DECODING (coding))
4312     {
4313       coding->produced = len_byte;
4314       if (multibyte
4315           && ! replace
4316           /* See the comment of the member heading_ascii in coding.h.  */
4317           && coding->heading_ascii < len_byte)
4318         {
4319           /* We still may have to combine byte at the head and the
4320              tail of the text in the region.  */
4321           if (from < GPT && GPT < to)
4322             move_gap_both (to, to_byte);
4323           len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4324           adjust_after_insert (from, from_byte, to, to_byte, len);
4325           coding->produced_char = len;
4326         }
4327       else
4328         {
4329           if (!replace)
4330             adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4331           coding->produced_char = len_byte;
4332         }
4333       return 0;
4334     }
4335
4336   /* Now we convert the text.  */
4337
4338   /* For encoding, we must process pre-write-conversion in advance.  */
4339   if (encodep
4340       && ! NILP (coding->pre_write_conversion)
4341       && SYMBOLP (coding->pre_write_conversion)
4342       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4343     {
4344       /* The function in pre-write-conversion may put a new text in a
4345          new buffer.  */
4346       struct buffer *prev = current_buffer;
4347       Lisp_Object new;
4348
4349       call2 (coding->pre_write_conversion,
4350              make_number (from), make_number (to));
4351       if (current_buffer != prev)
4352         {
4353           len = ZV - BEGV;
4354           new = Fcurrent_buffer ();
4355           set_buffer_internal_1 (prev);
4356           del_range_2 (from, from_byte, to, to_byte);
4357           TEMP_SET_PT_BOTH (from, from_byte);
4358           insert_from_buffer (XBUFFER (new), 1, len, 0);
4359           Fkill_buffer (new);
4360           if (orig_point >= to)
4361             orig_point += len - orig_len;
4362           else if (orig_point > from)
4363             orig_point = from;
4364           orig_len = len;
4365           to = from + len;
4366           from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
4367           to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
4368           len_byte = to_byte - from_byte;
4369           TEMP_SET_PT_BOTH (from, from_byte);
4370         }
4371     }
4372
4373   if (replace)
4374     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4375
4376   /* Try to skip the heading and tailing ASCIIs.  */
4377   {
4378     int from_byte_orig = from_byte, to_byte_orig = to_byte;
4379
4380     if (from < GPT && GPT < to)
4381       move_gap_both (from, from_byte);
4382     SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4383     if (from_byte == to_byte
4384         && coding->type != coding_type_ccl
4385         && ! (coding->mode & CODING_MODE_LAST_BLOCK
4386               && CODING_REQUIRE_FLUSHING (coding)))
4387       {
4388         coding->produced = len_byte;
4389         coding->produced_char = multibyte ? len : len_byte;
4390         if (!replace)
4391           /* We must record and adjust for this new text now.  */
4392           adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4393         return 0;
4394       }
4395
4396     head_skip = from_byte - from_byte_orig;
4397     tail_skip = to_byte_orig - to_byte;
4398     total_skip = head_skip + tail_skip;
4399     from += head_skip;
4400     to -= tail_skip;
4401     len -= total_skip; len_byte -= total_skip;
4402   }
4403
4404   /* The code conversion routine can not preserve text properties for
4405      now.  So, we must remove all text properties in the region.
4406      Here, we must suppress all modification hooks.  */
4407   if (replace)
4408     {
4409       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4410       inhibit_modification_hooks = 1;
4411       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4412       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4413     }
4414
4415   /* For converion, we must put the gap before the text in addition to
4416      making the gap larger for efficient decoding.  The required gap
4417      size starts from 2000 which is the magic number used in make_gap.
4418      But, after one batch of conversion, it will be incremented if we
4419      find that it is not enough .  */
4420   require = 2000;
4421
4422   if (GAP_SIZE  < require)
4423     make_gap (require - GAP_SIZE);
4424   move_gap_both (from, from_byte);
4425
4426   inserted = inserted_byte = 0;
4427   src = GAP_END_ADDR, dst = GPT_ADDR;
4428
4429   GAP_SIZE += len_byte;
4430   ZV -= len;
4431   Z -= len;
4432   ZV_BYTE -= len_byte;
4433   Z_BYTE -= len_byte;
4434
4435   if (GPT - BEG < BEG_UNCHANGED)
4436     BEG_UNCHANGED = GPT - BEG;
4437   if (Z - GPT < END_UNCHANGED)
4438     END_UNCHANGED = Z - GPT;
4439
4440   for (;;)
4441     {
4442       int result;
4443
4444       /* The buffer memory is changed from:
4445          +--------+converted-text+---------+-------original-text------+---+
4446          |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4447                   |<------------------- GAP_SIZE -------------------->|  */
4448       if (encodep)
4449         result = encode_coding (coding, src, dst, len_byte, 0);
4450       else
4451         result = decode_coding (coding, src, dst, len_byte, 0);
4452       /* to:
4453          +--------+-------converted-text--------+--+---original-text--+---+
4454          |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4455                   |<------------------- GAP_SIZE -------------------->|  */
4456       if (coding->fake_multibyte)
4457         fake_multibyte = 1;
4458
4459       if (!encodep && !multibyte)
4460         coding->produced_char = coding->produced;
4461       inserted += coding->produced_char;
4462       inserted_byte += coding->produced;
4463       len_byte -= coding->consumed;
4464       src += coding->consumed;
4465       dst += inserted_byte;
4466
4467       if (result == CODING_FINISH_NORMAL)
4468         {
4469           src += len_byte;
4470           break;
4471         }
4472       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4473         {
4474           unsigned char *pend = dst, *p = pend - inserted_byte;
4475           Lisp_Object eol_type;
4476
4477           /* Encode LFs back to the original eol format (CR or CRLF).  */
4478           if (coding->eol_type == CODING_EOL_CR)
4479             {
4480               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4481             }
4482           else
4483             {
4484               int count = 0;
4485
4486               while (p < pend) if (*p++ == '\n') count++;
4487               if (src - dst < count)
4488                 {
4489                   /* We don't have sufficient room for encoding LFs
4490                      back to CRLF.  We must record converted and
4491                      not-yet-converted text back to the buffer
4492                      content, enlarge the gap, then record them out of
4493                      the buffer contents again.  */
4494                   int add = len_byte + inserted_byte;
4495
4496                   GAP_SIZE -= add;
4497                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4498                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4499                   make_gap (count - GAP_SIZE);
4500                   GAP_SIZE += add;
4501                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4502                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4503                   /* Don't forget to update SRC, DST, and PEND.  */
4504                   src = GAP_END_ADDR - len_byte;
4505                   dst = GPT_ADDR + inserted_byte;
4506                   pend = dst;
4507                 }
4508               inserted += count;
4509               inserted_byte += count;
4510               coding->produced += count;
4511               p = dst = pend + count;
4512               while (count)
4513                 {
4514                   *--p = *--pend;
4515                   if (*p == '\n') count--, *--p = '\r';
4516                 }
4517             }
4518
4519           /* Suppress eol-format conversion in the further conversion.  */
4520           coding->eol_type = CODING_EOL_LF;
4521
4522           /* Set the coding system symbol to that for Unix-like EOL.  */
4523           eol_type = Fget (saved_coding_symbol, Qeol_type);
4524           if (VECTORP (eol_type)
4525               && XVECTOR (eol_type)->size == 3
4526               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4527             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4528           else
4529             coding->symbol = saved_coding_symbol;
4530
4531           continue;
4532         }
4533       if (len_byte <= 0)
4534         {
4535           if (coding->type != coding_type_ccl
4536               || coding->mode & CODING_MODE_LAST_BLOCK)
4537             break;
4538           coding->mode |= CODING_MODE_LAST_BLOCK;
4539           continue;
4540         }
4541       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4542         {
4543           /* The source text ends in invalid codes.  Let's just
4544              make them valid buffer contents, and finish conversion.  */
4545           inserted += len_byte;
4546           inserted_byte += len_byte;
4547           while (len_byte--)
4548             *dst++ = *src++;
4549           fake_multibyte = 1;
4550           break;
4551         }
4552       if (result == CODING_FINISH_INTERRUPT)
4553         {
4554           /* The conversion procedure was interrupted by a user.  */
4555           fake_multibyte = 1;
4556           break;
4557         }
4558       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
4559       if (coding->consumed < 1)
4560         {
4561           /* It's quite strange to require more memory without
4562              consuming any bytes.  Perhaps CCL program bug.  */
4563           fake_multibyte = 1;
4564           break;
4565         }
4566       if (first)
4567         {
4568           /* We have just done the first batch of conversion which was
4569              stoped because of insufficient gap.  Let's reconsider the
4570              required gap size (i.e. SRT - DST) now.
4571
4572              We have converted ORIG bytes (== coding->consumed) into
4573              NEW bytes (coding->produced).  To convert the remaining
4574              LEN bytes, we may need REQUIRE bytes of gap, where:
4575                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4576                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4577              Here, we are sure that NEW >= ORIG.  */
4578           float ratio = coding->produced - coding->consumed;
4579           ratio /= coding->consumed;
4580           require = len_byte * ratio;
4581           first = 0;
4582         }
4583       if ((src - dst) < (require + 2000))
4584         {
4585           /* See the comment above the previous call of make_gap.  */
4586           int add = len_byte + inserted_byte;
4587
4588           GAP_SIZE -= add;
4589           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4590           GPT += inserted_byte; GPT_BYTE += inserted_byte;
4591           make_gap (require + 2000);
4592           GAP_SIZE += add;
4593           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4594           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4595           /* Don't forget to update SRC, DST.  */
4596           src = GAP_END_ADDR - len_byte;
4597           dst = GPT_ADDR + inserted_byte;
4598         }
4599     }
4600   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
4601
4602   if (multibyte
4603       && (encodep
4604           || fake_multibyte
4605           || (to - from) != (to_byte - from_byte)))
4606     inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
4607
4608   /* If we have shrinked the conversion area, adjust it now.  */
4609   if (total_skip > 0)
4610     {
4611       if (tail_skip > 0)
4612         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4613       inserted += total_skip; inserted_byte += total_skip;
4614       GAP_SIZE += total_skip;
4615       GPT -= head_skip; GPT_BYTE -= head_skip;
4616       ZV -= total_skip; ZV_BYTE -= total_skip;
4617       Z -= total_skip; Z_BYTE -= total_skip;
4618       from -= head_skip; from_byte -= head_skip;
4619       to += tail_skip; to_byte += tail_skip;
4620     }
4621
4622   prev_Z = Z;
4623   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
4624   inserted = Z - prev_Z;
4625
4626   if (! encodep && ! NILP (coding->post_read_conversion))
4627     {
4628       Lisp_Object val;
4629
4630       if (from != PT)
4631         TEMP_SET_PT_BOTH (from, from_byte);
4632       prev_Z = Z;
4633       val = call1 (coding->post_read_conversion, make_number (inserted));
4634       CHECK_NUMBER (val, 0);
4635       inserted += Z - prev_Z;
4636     }
4637
4638   if (orig_point >= from)
4639     {
4640       if (orig_point >= from + orig_len)
4641         orig_point += inserted - orig_len;
4642       else
4643         orig_point = from;
4644       TEMP_SET_PT (orig_point);
4645     }
4646
4647   signal_after_change (from, to - from, inserted);
4648
4649   {
4650     coding->consumed = to_byte - from_byte;
4651     coding->consumed_char = to - from;
4652     coding->produced = inserted_byte;
4653     coding->produced_char = inserted;
4654   }
4655
4656   return 0;
4657 }
4658
     /* Encode (if ENCODEP is nonzero) or decode (if ENCODEP is zero) the
        Lisp string STR according to CODING, and return the result as a
        Lisp string.

        If CODING has a pre-write-conversion function (when encoding) or a
        post-read-conversion function (when decoding), STR is inserted
        into the temporary buffer " *code-converting-work*", converted
        there by code_convert_region, and the buffer text is returned.

        Otherwise the conversion is done directly into the buffer obtained
        from get_conversion_buffer.  When nothing has to be converted (and
        CODING is not a CCL coding system), STR itself is returned if
        NOCOPY is nonzero, and a copy of STR otherwise.  */
4659 Lisp_Object
4660 code_convert_string (str, coding, encodep, nocopy)
4661      Lisp_Object str;
4662      struct coding_system *coding;
4663      int encodep, nocopy;
4664 {
4665   int len;
4666   char *buf;
       /* FROM and TO_BYTE are byte offsets into STR delimiting the part
          that is actually converted; they start out covering all of STR.
          TO is the `size' field of STR (presumably its length in
          characters -- it is used that way in the insert_from_string
          call below).  */
4667   int from = 0, to = XSTRING (str)->size;
4668   int to_byte = STRING_BYTES (XSTRING (str));
4669   struct gcpro gcpro1;
4670   Lisp_Object saved_coding_symbol;
4671   int result;
4672
4673   saved_coding_symbol = Qnil;
4674   if (encodep && !NILP (coding->pre_write_conversion)
4675       || !encodep && !NILP (coding->post_read_conversion))
4676     {
4677       /* Since we have to call Lisp functions which assume target text
4678          is in a buffer, after setting a temporary buffer, call
4679          code_convert_region.  */
4680       int count = specpdl_ptr - specpdl;
4681       struct buffer *prev = current_buffer;
4682
4683       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4684       temp_output_buffer_setup (" *code-converting-work*");
4685       set_buffer_internal (XBUFFER (Vstandard_output));
4686       if (encodep)
4687         insert_from_string (str, 0, 0, to, to_byte, 0);
4688       else
4689         {
4690           /* We must insert the contents of STR as is without
4691              unibyte<->multibyte conversion.  */
4692           current_buffer->enable_multibyte_characters = Qnil;
4693           insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4694           current_buffer->enable_multibyte_characters = Qt;
4695         }
4696       code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
4697       if (encodep)
4698         /* We must return the buffer contents as unibyte string.  */
4699         current_buffer->enable_multibyte_characters = Qnil;
4700       str = make_buffer_string (BEGV, ZV, 0);
4701       set_buffer_internal (prev);
4702       return unbind_to (count, str);
4703     }
4704
4705   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4706     {
4707       /* See the comments in code_convert_region.  */
4708       if (coding->type == coding_type_undecided)
4709         {
4710           detect_coding (coding, XSTRING (str)->data, to_byte);
               /* Fall back to emacs-mule when detection decided nothing.  */
4711           if (coding->type == coding_type_undecided)
4712             coding->type = coding_type_emacs_mule;
4713         }
4714       if (coding->eol_type == CODING_EOL_UNDECIDED)
4715         {
               /* Remember the symbol so that it can be restored if the
                  eol format turns out to be inconsistent (see below).  */
4716           saved_coding_symbol = coding->symbol;
4717           detect_eol (coding, XSTRING (str)->data, to_byte);
4718           if (coding->eol_type == CODING_EOL_UNDECIDED)
4719             coding->eol_type = CODING_EOL_LF;
4720           /* We had better recover the original eol format if we
4721              encounter an inconsistent eol format while decoding.  */
4722           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4723         }
4724     }
4725
       /* If CODING needs no conversion in this direction, mark the whole
          string as already handled by making the region empty.  */
4726   if (encodep
4727       ? ! CODING_REQUIRE_ENCODING (coding)
4728       : ! CODING_REQUIRE_DECODING (coding))
4729     from = to_byte;
4730   else
4731     {
4732       /* Try to skip the leading and trailing ASCIIs.  */
4733       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4734                                 encodep);
4735     }
       /* Nothing is left to convert.  CCL coding systems are excluded from
          this shortcut and always go through the converter below.  */
4736   if (from == to_byte
4737       && coding->type != coding_type_ccl)
4738     return (nocopy ? str : Fcopy_sequence (str));
4739
       /* LEN is the buffer size wanted for the converted part plus the
          bytes of the skipped head ([0, FROM)) and tail ([TO_BYTE, end)).  */
4740   if (encodep)
4741     len = encoding_buffer_size (coding, to_byte - from);
4742   else
4743     len = decoding_buffer_size (coding, to_byte - from);
4744   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
       /* STR is GC-protected only around the call that obtains BUF.  */
4745   GCPRO1 (str);
4746   buf = get_conversion_buffer (len);
4747   UNGCPRO;
4748
       /* Copy the skipped head verbatim, then convert the middle part
          right behind it.  */
4749   if (from > 0)
4750     bcopy (XSTRING (str)->data, buf, from);
4751   result = (encodep
4752             ? encode_coding (coding, XSTRING (str)->data + from,
4753                              buf + from, to_byte - from, len)
4754             : decode_coding (coding, XSTRING (str)->data + from,
4755                              buf + from, to_byte - from, len));
4756   if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4757     {
4758       /* We simply try to decode the whole string again but without
4759          eol-conversion this time.
              NOTE(review): SAVED_CODING_SYMBOL is only assigned in the
              eol-detection branch above and is Qnil otherwise -- confirm
              that this result can only occur after that branch ran.  */
4760       coding->eol_type = CODING_EOL_LF;
4761       coding->symbol = saved_coding_symbol;
4762       return code_convert_string (str, coding, encodep, nocopy);
4763     }
4764
       /* Append the skipped tail verbatim after the converted bytes.  */
4765   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
4766          STRING_BYTES (XSTRING (str)) - to_byte);
4767
       /* From here on LEN counts only the unconverted head and tail bytes.  */
4768   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
4769   if (encodep)
4770     str = make_unibyte_string (buf, len + coding->produced);
4771   else
4772     {
           /* Character count of the converted part; it is recounted from
              the bytes when the decoder flagged fake multibyte text.  The
              head and tail add one character per byte (LEN).  */
4773       int chars= (coding->fake_multibyte
4774                   ? multibyte_chars_in_text (buf + from, coding->produced)
4775                   : coding->produced_char);
4776       str = make_multibyte_string (buf, len + chars, len + coding->produced);
4777     }
4778
4779   return str;
4780 }
4781
4782 \f
4783 #ifdef emacs
4784 /*** 8. Emacs Lisp library functions ***/
4785
4786 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4787   "Return t if OBJECT is nil or a coding-system.\n\
4788 See the documentation of `make-coding-system' for information\n\
4789 about coding-system objects.")
4790   (obj)
4791      Lisp_Object obj;
4792 {
4793   Lisp_Object spec;
4794
4795   /* nil is accepted as a coding system without further checks.  */
4796   if (NILP (obj))
4797     return Qt;
4798   /* Anything else needs a 5-element `coding-system' property vector.  */
4799   spec = SYMBOLP (obj) ? Fget (obj, Qcoding_system) : Qnil;
4800   return (VECTORP (spec) && XVECTOR (spec)->size == 5) ? Qt : Qnil;
4801 }
4802
4803 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4804        Sread_non_nil_coding_system, 1, 1, 0,
4805   "Read a coding system from the minibuffer, prompting with string PROMPT.")
4806   (prompt)
4807      Lisp_Object prompt;
4808 {
4809   Lisp_Object name;
4810
4811   /* Prompt again for as long as the answer is an empty string.  */
4812   do
4813     name = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt,
4814                              Qnil, Qcoding_system_history, Qnil, Qnil);
4815   while (XSTRING (name)->size == 0);
4816   return Fintern (name, Qnil);
4817 }
4818
4819 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4820   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4821 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4822   (prompt, default_coding_system)
4823      Lisp_Object prompt, default_coding_system;
4824 {
4825   Lisp_Object input, dflt = default_coding_system;
4826   /* A symbol given as the default is passed on as its name string.  */
4827   if (SYMBOLP (dflt))
4828     XSETSTRING (dflt, XSYMBOL (dflt)->name);
4829   input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
4830                             Qcoding_system_history, dflt, Qnil);
4831   return XSTRING (input)->size != 0 ? Fintern (input, Qnil) : Qnil;
4832 }
4833
4834 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4835        1, 1, 0,
4836   "Check validity of CODING-SYSTEM.\n\
4837 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4838 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4839 The value of property should be a vector of length 5.")
4840   (coding_system)
4841      Lisp_Object coding_system;
4842 {
4843   CHECK_SYMBOL (coding_system, 0);
4844   if (NILP (Fcoding_system_p (coding_system)))
         /* Invalid: signal, and signal again should Fsignal ever return.  */
4845     for (;;)
4846       Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4847   return coding_system;
4848 }
4849 \f
4850 /* Detect the possible coding systems of the SRC_BYTES bytes at SRC.
4851    Return only the first candidate found when HIGHEST is nonzero,
4852    otherwise the list of all candidates in category-list order.  */
4853 Lisp_Object
4854 detect_coding_system (src, src_bytes, highest)
4855      unsigned char *src;
4856      int src_bytes, highest;
4857 {
4858   Lisp_Object result, tail, eol_vec;
4859   int mask, eol, unused, category;
4860
4861   mask = detect_coding_mask (src, src_bytes, NULL, &unused);
4862   eol = detect_eol_type (src, src_bytes, &unused);
4863   /* An inconsistent end-of-line format counts as an undecided one.  */
4864   if (eol == CODING_EOL_INCONSISTENT)
4865     eol = CODING_EOL_UNDECIDED;
4866
4867   if (mask == 0)
4868     {
4869       /* No category bit is set: answer `undecided', or its subsidiary
4870          for the detected end-of-line format.  */
4871       result = Qundecided;
4872       if (eol != CODING_EOL_UNDECIDED)
4873         {
4874           eol_vec = Fget (Qundecided, Qeol_type);
4875           if (VECTORP (eol_vec))
4876             result = XVECTOR (eol_vec)->contents[eol];
4877         }
4878       return (highest ? result : Fcons (result, Qnil));
4879     }
4880
4881   /* Collect the value of every category in Vcoding_category_list whose
4882      bit is set in MASK; the first hit is enough when HIGHEST is set.  */
4883   result = Qnil;
4884   for (tail = Vcoding_category_list; !NILP (tail); tail = XCDR (tail))
4885     {
4886       category = XFASTINT (Fget (XCAR (tail), Qcoding_category_index));
4887       if (! (mask & (1 << category)))
4888         continue;
4889       result = Fcons (Fsymbol_value (XCAR (tail)), result);
4890       if (highest)
4891         break;
4892     }
4893   if (!highest)
4894     result = Fnreverse (result);
4895
4896   /* With a known end-of-line format, swap each candidate for its
4897      subsidiary of that format, where it has one.  */
4898   if (eol != CODING_EOL_UNDECIDED)
4899     for (tail = result; !NILP (tail); tail = XCDR (tail))
4900       {
4901         eol_vec = Fget (XCAR (tail), Qeol_type);
4902         if (VECTORP (eol_vec))
4903           XCAR (tail) = XVECTOR (eol_vec)->contents[eol];
4904       }
4905   return (highest ? XCAR (result) : result);
4906 }
4907
4908 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4909        2, 3, 0,
4910   "Detect coding system of the text in the region between START and END.\n\
4911 Return a list of possible coding systems ordered by priority.\n\
4912 \n\
4913 If only ASCII characters are found, it returns a list of single element\n\
4914 `undecided' or its subsidiary coding system according to a detected\n\
4915 end-of-line format.\n\
4916 \n\
4917 If optional argument HIGHEST is non-nil, return the coding system of\n\
4918 highest priority.")
4919   (start, end, highest)
4920      Lisp_Object start, end, highest;
4921 {
4922   int beg, fin, beg_byte, fin_byte, nbytes;
4923
4924   CHECK_NUMBER_COERCE_MARKER (start, 0);
4925   CHECK_NUMBER_COERCE_MARKER (end, 1);
4926   validate_region (&start, &end);
4927
4928   beg = XINT (start);
4929   fin = XINT (end);
4930   beg_byte = CHAR_TO_BYTE (beg);
4931   fin_byte = CHAR_TO_BYTE (fin);
4932   nbytes = fin_byte - beg_byte;
4933
4934   /* When the region extends across the gap, move the gap to its end.  */
4935   if (beg < GPT && fin >= GPT)
4936     move_gap_both (fin, fin_byte);
4937   return detect_coding_system (BYTE_POS_ADDR (beg_byte), nbytes,
4938                                !NILP (highest));
4939 }
4940
4941 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4942        1, 2, 0,
4943   "Detect coding system of the text in STRING.\n\
4944 Return a list of possible coding systems ordered by priority.\n\
4945 \n\
4946 If only ASCII characters are found, it returns a list of single element\n\
4947 `undecided' or its subsidiary coding system according to a detected\n\
4948 end-of-line format.\n\
4949 \n\
4950 If optional argument HIGHEST is non-nil, return the coding system of\n\
4951 highest priority.")
4952   (string, highest)
4953      Lisp_Object string, highest;
4954 {
4955   int nbytes;
4956
4957   CHECK_STRING (string, 0);
4958   nbytes = STRING_BYTES (XSTRING (string));
4959   return detect_coding_system (XSTRING (string)->data, nbytes, !NILP (highest));
4960 }
4961
4962 /* Encode (ENCODEP nonzero) or decode the text between buffer positions
4963    START and END with CODING_SYSTEM, record the coding system used in
4964    Vlast_coding_system_used, and return the resulting length.  */
4965 Lisp_Object
4966 code_convert_region1 (start, end, coding_system, encodep)
4967      Lisp_Object start, end, coding_system;
4968      int encodep;
4969 {
4970   struct coding_system coding;
4971   int from, to;
4972
4973   CHECK_NUMBER_COERCE_MARKER (start, 0);
4974   CHECK_NUMBER_COERCE_MARKER (end, 1);
4975   CHECK_SYMBOL (coding_system, 2);
4976   validate_region (&start, &end);
4977   from = XFASTINT (start);
4978   to = XFASTINT (end);
4979
       /* A nil coding system converts nothing: just report the length of
          the region.  */
4980   if (NILP (coding_system))
4981     return make_number (to - from);
4982   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4983     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
4984
       /* The region is handed over in one piece, so it is the last block.  */
4985   coding.mode |= CODING_MODE_LAST_BLOCK;
4986   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4987                        &coding, encodep, 1);
       /* Record the symbol only now, after the conversion has run.  */
4988   Vlast_coding_system_used = coding.symbol;
4989   return make_number (coding.produced_char);
     }
4990
4991 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4992        3, 3, "r\nzCoding system: ",
4993   "Decode the current region by specified coding system.\n\
4994 When called from a program, takes three arguments:\n\
4995 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
4996 This function sets `last-coding-system-used' to the precise coding system\n\
4997 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4998 not fully specified.)\n\
4999 It returns the length of the decoded text.")
5000   (start, end, coding_system)
5001      Lisp_Object start, end, coding_system;
5002 {
       int encodep = 0;          /* Zero selects decoding.  */

5003   return code_convert_region1 (start, end, coding_system, encodep);
5004 }
5005
5006 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5007        3, 3, "r\nzCoding system: ",
5008   "Encode the current region by specified coding system.\n\
5009 When called from a program, takes three arguments:\n\
5010 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5011 This function sets `last-coding-system-used' to the precise coding system\n\
5012 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5013 not fully specified.)\n\
5014 It returns the length of the encoded text.")
5015   (start, end, coding_system)
5016      Lisp_Object start, end, coding_system;
5017 {
       int encodep = 1;          /* Nonzero selects encoding.  */

5018   return code_convert_region1 (start, end, coding_system, encodep);
5019 }
5020
5021 /* Encode (ENCODEP nonzero) or decode STRING with CODING_SYSTEM and set
5022    Vlast_coding_system_used to the coding system actually used.  With a
5023    nil CODING_SYSTEM, return STRING (if NOCOPY is non-nil) or a copy.  */
5024 Lisp_Object
5025 code_convert_string1 (string, coding_system, nocopy, encodep)
5026      Lisp_Object string, coding_system, nocopy;
5027      int encodep;
5028 {
5029   struct coding_system coding;

5030   CHECK_STRING (string, 0);
5031   CHECK_SYMBOL (coding_system, 1);
5032   if (NILP (coding_system))
5033     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5034   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5035     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);

5036   coding.mode |= CODING_MODE_LAST_BLOCK;
       /* Convert first and record the symbol afterwards: code_convert_string
          may replace CODING.symbol while it works, and the value to record
          is the one in effect after the conversion, as in
          code_convert_region1.  */
5037   string = code_convert_string (string, &coding, encodep, !NILP (nocopy));
5038   Vlast_coding_system_used = coding.symbol;
5039   return string;
5040 }
5041
5042 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5043        2, 3, 0,
5044   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5045 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5046 if the decoding operation is trivial.\n\
5047 This function sets `last-coding-system-used' to the precise coding system\n\
5048 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5049 not fully specified.)")
5050   (string, coding_system, nocopy)
5051      Lisp_Object string, coding_system, nocopy;
5052 {
       int encodep = 0;          /* Zero selects decoding.  */

5053   return code_convert_string1 (string, coding_system, nocopy, encodep);
5054 }
5055
5056 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5057        2, 3, 0,
5058   "Encode STRING to CODING-SYSTEM, and return the result.\n\
5059 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5060 if the encoding operation is trivial.\n\
5061 This function sets `last-coding-system-used' to the precise coding system\n\
5062 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5063 not fully specified.)")
5064   (string, coding_system, nocopy)
5065      Lisp_Object string, coding_system, nocopy;
5066 {
       int encodep = 1;          /* Nonzero selects encoding.  */

5067   return code_convert_string1 (string, coding_system, nocopy, encodep);
5068 }
5069
5070 /* Encode (ENCODEP nonzero) or decode STRING according to CODING_SYSTEM
5071    without setting Vlast_coding_system_used.  STRING itself is returned
5072    when CODING_SYSTEM is nil or when no conversion turns out necessary.  */
5073 Lisp_Object
5074 code_convert_string_norecord (string, coding_system, encodep)
5075      Lisp_Object string, coding_system;
5076      int encodep;
5077 {
5078   struct coding_system coding;
5079
5080   CHECK_STRING (string, 0);
5081   CHECK_SYMBOL (coding_system, 1);
5082
5083   if (NILP (coding_system))
5084     return string;
5085
5086   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5087     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5088
5089   coding.mode |= CODING_MODE_LAST_BLOCK;
       /* The NOCOPY parameter of code_convert_string is a C int, so pass a
          plain nonzero value rather than the Lisp object Qt.  */
5090   return code_convert_string (string, &coding, encodep, 1);
5091 }
5092 \f
5093 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5094   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5095 Return the corresponding character.")
5096   (code)
5097      Lisp_Object code;
5098 {
5099   unsigned char c1, c2, s1, s2;
5100   Lisp_Object val;
5101
5102   CHECK_NUMBER (code, 0);
       /* S1 is the high byte of CODE (zero for a one-byte code), S2 the
          low byte.  */
5103   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5104   if (s1 == 0)
5105     {
           /* One-byte code: below 0x80 it is returned as is, 0xA0..0xDF is
              mapped into JISX0201 katakana, anything else is an error.  */
5106       if (s2 < 0x80)
5107         XSETFASTINT (val, s2);
           /* Both bounds must hold.  Joined with `||' this test was always
              true, so bytes such as 0x80 or 0xFF were decoded as katakana
              and the error branch below could never be reached.  */
5108       else if (s2 >= 0xA0 && s2 <= 0xDF)
5109         XSETFASTINT (val,
5110                      MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5111       else
5112         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5113     }
5114   else
5115     {
           /* Two-byte code: the lead byte must lie in 0x80..0x9F or
              0xE0..0xEF, the trail byte in 0x40..0xFC except 0x7F.  */
5116       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
5117           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5118         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5119       DECODE_SJIS (s1, s2, c1, c2);
5120       XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5121     }
5122   return val;
5123 }
5124
5125 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5126   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5127 Return the corresponding code in SJIS.")
5128   (ch)
5129      Lisp_Object ch;
5130 {
5131   int charset, c1, c2, s1, s2;
5132   Lisp_Object val;
5133
5134   CHECK_NUMBER (ch, 0);
5135   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5136
5137   /* An ASCII character is its own code.  */
5138   if (charset == CHARSET_ASCII)
5139     return ch;
5140   /* A JISX0208 character becomes a two-byte code.  */
5141   if (charset == charset_jisx0208
5142       && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5143     {
5144       ENCODE_SJIS (c1, c2, s1, s2);
5145       XSETFASTINT (val, (s1 << 8) | s2);
5146     }
5147   /* A JISX0201 katakana character becomes C1 with the high bit set.
5148      NOTE(review): the upper bound tests C2, not C1 -- confirm intent.  */
5149   else if (charset == charset_katakana_jisx0201 && c1 > 0x20 && c2 < 0xE0)
5150     XSETFASTINT (val, c1 | 0x80);
5151   else
5152     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5153   return val;
5154 }
5155
5156 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5157   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5158 Return the corresponding character.")
5159   (code)
5160      Lisp_Object code;
5161 {
5162   int charset;
5163   unsigned char b1, b2, c1, c2;
5164   Lisp_Object val;
5165
5166   CHECK_NUMBER (code, 0);
5167   b1 = (XFASTINT (code)) >> 8;
5168   b2 = (XFASTINT (code)) & 0xFF;
5169   if (b1 == 0)
5170     {
5171       /* One-byte code: only values below 0x80 are accepted.  */
5172       if (b2 >= 0x80)
5173         error ("Invalid BIG5 code: %x", XFASTINT (code));
5174       return code;
5175     }
5176   /* Two bytes: lead byte 0xA1..0xFE, trail byte 0x40..0x7E or 0xA1..0xFE.  */
5177   if (b1 < 0xA1 || b1 > 0xFE
5178       || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
5179     error ("Invalid BIG5 code: %x", XFASTINT (code));
5180   DECODE_BIG5 (b1, b2, charset, c1, c2);
5181   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5182   return val;
5183 }
5184
5185 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5186   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5187 Return the corresponding character code in Big5.")
5188   (ch)
5189      Lisp_Object ch;
5190 {
5191   int charset, c1, c2, b1, b2;
5192   Lisp_Object val;
5193
5194   CHECK_NUMBER (ch, 0);
5195   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5196   /* An ASCII character is its own code.  */
5197   if (charset == CHARSET_ASCII)
5198     return ch;
5199
5200   /* Only characters inside the accepted code range of one of the two
5201      Big5 charsets can be encoded.  */
5202   if (! ((charset == charset_big5_1
5203           && XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec)
5204          || (charset == charset_big5_2
5205              && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2)))
5206     error ("Can't encode to Big5: %d", XFASTINT (ch));
5207   ENCODE_BIG5 (charset, c1, c2, b1, b2);
5208   XSETFASTINT (val, (b1 << 8) | b2);
5209   return val;
5210 }
5212 \f
5213 DEFUN ("set-terminal-coding-system-internal",
5214        Fset_terminal_coding_system_internal,
5215        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5216   (coding_system)
5217      Lisp_Object coding_system;
5218 {
5219   Lisp_Object checked;

5220   CHECK_SYMBOL (coding_system, 0);
5221   checked = Fcheck_coding_system (coding_system);
5222   setup_coding_system (checked, &terminal_coding);
       /* Keep unsafe characters from being sent to the terminal.  */
5223   terminal_coding.flags = terminal_coding.flags | CODING_FLAG_ISO_SAFE;
5224   return Qnil;
5225 }
5226
5227 DEFUN ("set-safe-terminal-coding-system-internal",
5228        Fset_safe_terminal_coding_system_internal,
5229        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5230   (coding_system)
5231      Lisp_Object coding_system;
5232 {
       Lisp_Object checked;

       /* Validate the symbol, then set up safe_terminal_coding from it.  */
5233   CHECK_SYMBOL (coding_system, 0);
5234   checked = Fcheck_coding_system (coding_system);
5235   setup_coding_system (checked, &safe_terminal_coding);
5236   return Qnil;
5237 }
5238
5239 DEFUN ("terminal-coding-system",
5240        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5241   "Return coding system specified for terminal output.")
5242   ()
5243 {
       Lisp_Object symbol;

       /* The answer is the symbol stored in terminal_coding.  */
       symbol = terminal_coding.symbol;
5244   return symbol;
5245 }
5246
5247 DEFUN ("set-keyboard-coding-system-internal",
5248        Fset_keyboard_coding_system_internal,
5249        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5250   (coding_system)
5251      Lisp_Object coding_system;
5252 {
       Lisp_Object checked;

       /* Validate the symbol, then set up keyboard_coding from it.  */
5253   CHECK_SYMBOL (coding_system, 0);
       checked = Fcheck_coding_system (coding_system);
5254   setup_coding_system (checked, &keyboard_coding);
5255   return Qnil;
5256 }
5257
5258 DEFUN ("keyboard-coding-system",
5259        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5260   "Return coding system specified for decoding keyboard input.")
5261   ()
5262 {
       Lisp_Object symbol;

       /* The answer is the symbol stored in keyboard_coding.  */
       symbol = keyboard_coding.symbol;
5263   return symbol;
5264 }
5265
5266 \f
5267 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5268        Sfind_operation_coding_system,  1, MANY, 0,
5269   "Choose a coding system for an operation based on the target name.\n\
5270 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5271 DECODING-SYSTEM is the coding system to use for decoding\n\
5272 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5273 for encoding (in case OPERATION does encoding).\n\
5274 \n\
5275 The first argument OPERATION specifies an I/O primitive:\n\
5276   For file I/O, `insert-file-contents' or `write-region'.\n\
5277   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5278   For network I/O, `open-network-stream'.\n\
5279 \n\
5280 The remaining arguments should be the same arguments that were passed\n\
5281 to the primitive.  Depending on which primitive, one of those arguments\n\
5282 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5283 whichever argument specifies the file name is TARGET.\n\
5284 \n\
5285 TARGET has a meaning which depends on OPERATION:\n\
5286   For file I/O, TARGET is a file name.\n\
5287   For process I/O, TARGET is a process name.\n\
5288   For network I/O, TARGET is a service name or a port number\n\
5289 \n\
5290 This function looks up what specified for TARGET in,\n\
5291 `file-coding-system-alist', `process-coding-system-alist',\n\
5292 or `network-coding-system-alist' depending on OPERATION.\n\
5293 They may specify a coding system, a cons of coding systems,\n\
5294 or a function symbol to call.\n\
5295 In the last case, we call the function with one argument,\n\
5296 which is a list of all the arguments given to this function.")
5297   (nargs, args)
5298      int nargs;
5299      Lisp_Object *args;
5300 {
5301   Lisp_Object operation, target_idx, target, val;
5302   register Lisp_Object chain;
5303
5304   if (nargs < 2)
5305     error ("Too few arguments");
5306   operation = args[0];
       /* The `target-idx' property of OPERATION says which of the arguments
          following OPERATION is the target.  A negative index can never
          name one of them, so it is rejected along with non-integers.  */
5307   if (!SYMBOLP (operation)
5308       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))
           || XINT (target_idx) < 0)
5309     error ("Invalid first arguement");
       /* The target is read from args[target_idx + 1], so at least
          target_idx + 2 arguments are required; the previous `<' test let
          nargs == target_idx + 1 through and read past the end of ARGS.  */
5310   if (nargs <= 1 + XINT (target_idx))
5311     error ("Too few arguments for operation: %s",
5312            XSYMBOL (operation)->name->data);
5313   target = args[XINT (target_idx) + 1];
       /* The target must be a string; only open-network-stream may also
          take an integer.  */
5314   if (!(STRINGP (target)
5315         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5316     error ("Invalid %dth argument", XINT (target_idx) + 1);
5317
       /* Choose the alist to search: the file alist for the two file
          primitives, the network alist for open-network-stream, and the
          process alist for every other operation.  */
5318   chain = ((EQ (operation, Qinsert_file_contents)
5319             || EQ (operation, Qwrite_region))
5320            ? Vfile_coding_system_alist
5321            : (EQ (operation, Qopen_network_stream)
5322               ? Vnetwork_coding_system_alist
5323               : Vprocess_coding_system_alist));
5324   if (NILP (chain))
5325     return Qnil;
5326
       /* The first element whose key matches TARGET decides the result:
          a string key is matched against a string target with
          fast_string_match, an integer target is compared with EQ.  */
5327   for (; CONSP (chain); chain = XCDR (chain))
5328     {
5329       Lisp_Object elt;
5330       elt = XCAR (chain);
5331
5332       if (CONSP (elt)
5333           && ((STRINGP (target)
5334                && STRINGP (XCAR (elt))
5335                && fast_string_match (XCAR (elt), target) >= 0)
5336               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5337         {
5338           val = XCDR (elt);
5339           /* Here, if VAL is both a valid coding system and a valid
5340              function symbol, we return VAL as a coding system.  */
5341           if (CONSP (val))
5342             return val;
5343           if (! SYMBOLP (val))
5344             return Qnil;
5345           if (! NILP (Fcoding_system_p (val)))
5346             return Fcons (val, val);
5347           if (! NILP (Ffboundp (val)))
5348             {
                   /* Call the function with the list of all arguments and
                      accept a cons or a coding system as its answer.  */
5349               val = call1 (val, Flist (nargs, args));
5350               if (CONSP (val))
5351                 return val;
5352               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5353                 return Fcons (val, val);
5354             }
5355           return Qnil;
5356         }
5357     }
5358   return Qnil;
5359 }
5360
5361 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5362        Supdate_coding_systems_internal, 0, 0, 0,
5363   "Update internal database for ISO2022 and CCL based coding systems.\n\
5364 When values of the following coding categories are changed, you must\n\
5365 call this function:\n\
5366   coding-category-iso-7, coding-category-iso-7-tight,\n\
5367   coding-category-iso-8-1, coding-category-iso-8-2,\n\
5368   coding-category-iso-7-else, coding-category-iso-8-else,\n\
5369   coding-category-ccl")
5370   ()
5371 {
5372   int i;
5373
5374   for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
5375     {
5376       Lisp_Object val;
5377
5378       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5379       if (!NILP (val))
5380         {
5381           if (! coding_system_table[i])
5382             coding_system_table[i] = ((struct coding_system *)
5383                                       xmalloc (sizeof (struct coding_system)));
5384           setup_coding_system (val, coding_system_table[i]);
5385         }
5386       else if (coding_system_table[i])
5387         {
5388           xfree (coding_system_table[i]);
5389           coding_system_table[i] = NULL;
5390         }
5391     }
5392
5393   return Qnil;
5394 }
5395
5396 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5397        Sset_coding_priority_internal, 0, 0, 0,
5398   "Update internal database for the current value of `coding-category-list'.\n\
5399 This function is internal use only.")
5400   ()
5401 {
5402   int i = 0, idx;
5403   Lisp_Object val;
5404
5405   val = Vcoding_category_list;
5406
5407   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5408     {
5409       if (! SYMBOLP (XCAR (val)))
5410         break;
5411       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5412       if (idx >= CODING_CATEGORY_IDX_MAX)
5413         break;
5414       coding_priorities[i++] = (1 << idx);
5415       val = XCDR (val);
5416     }
5417   /* If coding-category-list is valid and contains all coding
5418      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5419      the following code saves Emacs from craching.  */
5420   while (i < CODING_CATEGORY_IDX_MAX)
5421     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5422
5423   return Qnil;
5424 }
5425
5426 #endif /* emacs */
5427
5428 \f
5429 /*** 9. Post-amble ***/
5430
5431 void
5432 init_coding ()
5433 {
5434   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5435 }
5436
5437 void
5438 init_coding_once ()
5439 {
5440   int i;
5441
5442   /* Emacs' internal format specific initialize routine.  */
5443   for (i = 0; i <= 0x20; i++)
5444     emacs_code_class[i] = EMACS_control_code;
5445   emacs_code_class[0x0A] = EMACS_linefeed_code;
5446   emacs_code_class[0x0D] = EMACS_carriage_return_code;
5447   for (i = 0x21 ; i < 0x7F; i++)
5448     emacs_code_class[i] = EMACS_ascii_code;
5449   emacs_code_class[0x7F] = EMACS_control_code;
5450   emacs_code_class[0x80] = EMACS_leading_code_composition;
5451   for (i = 0x81; i < 0xFF; i++)
5452     emacs_code_class[i] = EMACS_invalid_code;
5453   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5454   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5455   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5456   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5457
5458   /* ISO2022 specific initialize routine.  */
5459   for (i = 0; i < 0x20; i++)
5460     iso_code_class[i] = ISO_control_code;
5461   for (i = 0x21; i < 0x7F; i++)
5462     iso_code_class[i] = ISO_graphic_plane_0;
5463   for (i = 0x80; i < 0xA0; i++)
5464     iso_code_class[i] = ISO_control_code;
5465   for (i = 0xA1; i < 0xFF; i++)
5466     iso_code_class[i] = ISO_graphic_plane_1;
5467   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5468   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5469   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5470   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5471   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5472   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5473   iso_code_class[ISO_CODE_ESC] = ISO_escape;
5474   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5475   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5476   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5477
5478   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
5479
5480   setup_coding_system (Qnil, &keyboard_coding);
5481   setup_coding_system (Qnil, &terminal_coding);
5482   setup_coding_system (Qnil, &safe_terminal_coding);
5483   setup_coding_system (Qnil, &default_buffer_file_coding);
5484
5485   bzero (coding_system_table, sizeof coding_system_table);
5486
5487   bzero (ascii_skip_code, sizeof ascii_skip_code);
5488   for (i = 0; i < 128; i++)
5489     ascii_skip_code[i] = 1;
5490
5491 #if defined (MSDOS) || defined (WINDOWSNT)
5492   system_eol_type = CODING_EOL_CRLF;
5493 #else
5494   system_eol_type = CODING_EOL_LF;
5495 #endif
5496 }
5497
5498 #ifdef emacs
5499
5500 void
5501 syms_of_coding ()
5502 {
5503   Qtarget_idx = intern ("target-idx");
5504   staticpro (&Qtarget_idx);
5505
5506   Qcoding_system_history = intern ("coding-system-history");
5507   staticpro (&Qcoding_system_history);
5508   Fset (Qcoding_system_history, Qnil);
5509
5510   /* Target FILENAME is the first argument.  */
5511   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
5512   /* Target FILENAME is the third argument.  */
5513   Fput (Qwrite_region, Qtarget_idx, make_number (2));
5514
5515   Qcall_process = intern ("call-process");
5516   staticpro (&Qcall_process);
5517   /* Target PROGRAM is the first argument.  */
5518   Fput (Qcall_process, Qtarget_idx, make_number (0));
5519
5520   Qcall_process_region = intern ("call-process-region");
5521   staticpro (&Qcall_process_region);
5522   /* Target PROGRAM is the third argument.  */
5523   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5524
5525   Qstart_process = intern ("start-process");
5526   staticpro (&Qstart_process);
5527   /* Target PROGRAM is the third argument.  */
5528   Fput (Qstart_process, Qtarget_idx, make_number (2));
5529
5530   Qopen_network_stream = intern ("open-network-stream");
5531   staticpro (&Qopen_network_stream);
5532   /* Target SERVICE is the fourth argument.  */
5533   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5534
5535   Qcoding_system = intern ("coding-system");
5536   staticpro (&Qcoding_system);
5537
5538   Qeol_type = intern ("eol-type");
5539   staticpro (&Qeol_type);
5540
5541   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5542   staticpro (&Qbuffer_file_coding_system);
5543
5544   Qpost_read_conversion = intern ("post-read-conversion");
5545   staticpro (&Qpost_read_conversion);
5546
5547   Qpre_write_conversion = intern ("pre-write-conversion");
5548   staticpro (&Qpre_write_conversion);
5549
5550   Qno_conversion = intern ("no-conversion");
5551   staticpro (&Qno_conversion);
5552
5553   Qundecided = intern ("undecided");
5554   staticpro (&Qundecided);
5555
5556   Qcoding_system_p = intern ("coding-system-p");
5557   staticpro (&Qcoding_system_p);
5558
5559   Qcoding_system_error = intern ("coding-system-error");
5560   staticpro (&Qcoding_system_error);
5561
5562   Fput (Qcoding_system_error, Qerror_conditions,
5563         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5564   Fput (Qcoding_system_error, Qerror_message,
5565         build_string ("Invalid coding system"));
5566
5567   Qcoding_category = intern ("coding-category");
5568   staticpro (&Qcoding_category);
5569   Qcoding_category_index = intern ("coding-category-index");
5570   staticpro (&Qcoding_category_index);
5571
5572   Vcoding_category_table
5573     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5574   staticpro (&Vcoding_category_table);
5575   {
5576     int i;
5577     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5578       {
5579         XVECTOR (Vcoding_category_table)->contents[i]
5580           = intern (coding_category_name[i]);
5581         Fput (XVECTOR (Vcoding_category_table)->contents[i],
5582               Qcoding_category_index, make_number (i));
5583       }
5584   }
5585
5586   Qtranslation_table = intern ("translation-table");
5587   staticpro (&Qtranslation_table);
5588   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
5589
5590   Qtranslation_table_id = intern ("translation-table-id");
5591   staticpro (&Qtranslation_table_id);
5592
5593   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5594   staticpro (&Qtranslation_table_for_decode);
5595
5596   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5597   staticpro (&Qtranslation_table_for_encode);
5598
5599   Qsafe_charsets = intern ("safe-charsets");
5600   staticpro (&Qsafe_charsets);
5601
5602   Qvalid_codes = intern ("valid-codes");
5603   staticpro (&Qvalid_codes);
5604
5605   Qemacs_mule = intern ("emacs-mule");
5606   staticpro (&Qemacs_mule);
5607
5608   Qraw_text = intern ("raw-text");
5609   staticpro (&Qraw_text);
5610
5611   defsubr (&Scoding_system_p);
5612   defsubr (&Sread_coding_system);
5613   defsubr (&Sread_non_nil_coding_system);
5614   defsubr (&Scheck_coding_system);
5615   defsubr (&Sdetect_coding_region);
5616   defsubr (&Sdetect_coding_string);
5617   defsubr (&Sdecode_coding_region);
5618   defsubr (&Sencode_coding_region);
5619   defsubr (&Sdecode_coding_string);
5620   defsubr (&Sencode_coding_string);
5621   defsubr (&Sdecode_sjis_char);
5622   defsubr (&Sencode_sjis_char);
5623   defsubr (&Sdecode_big5_char);
5624   defsubr (&Sencode_big5_char);
5625   defsubr (&Sset_terminal_coding_system_internal);
5626   defsubr (&Sset_safe_terminal_coding_system_internal);
5627   defsubr (&Sterminal_coding_system);
5628   defsubr (&Sset_keyboard_coding_system_internal);
5629   defsubr (&Skeyboard_coding_system);
5630   defsubr (&Sfind_operation_coding_system);
5631   defsubr (&Supdate_coding_systems_internal);
5632   defsubr (&Sset_coding_priority_internal);
5633
5634   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5635     "List of coding systems.\n\
5636 \n\
5637 Do not alter the value of this variable manually.  This variable should be\n\
5638 updated by the functions `make-coding-system' and\n\
5639 `define-coding-system-alias'.");
5640   Vcoding_system_list = Qnil;
5641
5642   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5643     "Alist of coding system names.\n\
5644 Each element is one element list of coding system name.\n\
5645 This variable is given to `completing-read' as TABLE argument.\n\
5646 \n\
5647 Do not alter the value of this variable manually.  This variable should be\n\
5648 updated by the functions `make-coding-system' and\n\
5649 `define-coding-system-alias'.");
5650   Vcoding_system_alist = Qnil;
5651
5652   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5653     "List of coding-categories (symbols) ordered by priority.");
5654   {
5655     int i;
5656
5657     Vcoding_category_list = Qnil;
5658     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5659       Vcoding_category_list
5660         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5661                  Vcoding_category_list);
5662   }
5663
5664   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
5665     "Specify the coding system for read operations.\n\
5666 It is useful to bind this variable with `let', but do not set it globally.\n\
5667 If the value is a coding system, it is used for decoding on read operation.\n\
5668 If not, an appropriate element is used from one of the coding system alists:\n\
5669 There are three such tables, `file-coding-system-alist',\n\
5670 `process-coding-system-alist', and `network-coding-system-alist'.");
5671   Vcoding_system_for_read = Qnil;
5672
5673   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
5674     "Specify the coding system for write operations.\n\
5675 Programs bind this variable with `let', but you should not set it globally.\n\
5676 If the value is a coding system, it is used for encoding of output,\n\
5677 when writing it to a file and when sending it to a file or subprocess.\n\
5678 \n\
5679 If this does not specify a coding system, an appropriate element\n\
5680 is used from one of the coding system alists:\n\
5681 There are three such tables, `file-coding-system-alist',\n\
5682 `process-coding-system-alist', and `network-coding-system-alist'.\n\
5683 For output to files, if the above procedure does not specify a coding system,\n\
5684 the value of `buffer-file-coding-system' is used.");
5685   Vcoding_system_for_write = Qnil;
5686
5687   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
5688     "Coding system used in the latest file or process I/O.");
5689   Vlast_coding_system_used = Qnil;
5690
5691   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
5692     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
5693 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5694 such conversion.");
5695   inhibit_eol_conversion = 0;
5696
5697   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5698     "Non-nil means process buffer inherits coding system of process output.\n\
5699 Bind it to t if the process output is to be treated as if it were a file\n\
5700 read from some filesystem.");
5701   inherit_process_coding_system = 0;
5702
5703   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5704     "Alist to decide a coding system to use for a file I/O operation.\n\
5705 The format is ((PATTERN . VAL) ...),\n\
5706 where PATTERN is a regular expression matching a file name,\n\
5707 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5708 If VAL is a coding system, it is used for both decoding and encoding\n\
5709 the file contents.\n\
5710 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5711 and the cdr part is used for encoding.\n\
5712 If VAL is a function symbol, the function must return a coding system\n\
5713 or a cons of coding systems which are used as above.\n\
5714 \n\
5715 See also the function `find-operation-coding-system'\n\
5716 and the variable `auto-coding-alist'.");
5717   Vfile_coding_system_alist = Qnil;
5718
5719   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5720     "Alist to decide a coding system to use for a process I/O operation.\n\
5721 The format is ((PATTERN . VAL) ...),\n\
5722 where PATTERN is a regular expression matching a program name,\n\
5723 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5724 If VAL is a coding system, it is used for both decoding what received\n\
5725 from the program and encoding what sent to the program.\n\
5726 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5727 and the cdr part is used for encoding.\n\
5728 If VAL is a function symbol, the function must return a coding system\n\
5729 or a cons of coding systems which are used as above.\n\
5730 \n\
5731 See also the function `find-operation-coding-system'.");
5732   Vprocess_coding_system_alist = Qnil;
5733
5734   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5735     "Alist to decide a coding system to use for a network I/O operation.\n\
5736 The format is ((PATTERN . VAL) ...),\n\
5737 where PATTERN is a regular expression matching a network service name\n\
5738 or is a port number to connect to,\n\
5739 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5740 If VAL is a coding system, it is used for both decoding what received\n\
5741 from the network stream and encoding what sent to the network stream.\n\
5742 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5743 and the cdr part is used for encoding.\n\
5744 If VAL is a function symbol, the function must return a coding system\n\
5745 or a cons of coding systems which are used as above.\n\
5746 \n\
5747 See also the function `find-operation-coding-system'.");
5748   Vnetwork_coding_system_alist = Qnil;
5749
5750   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5751     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5752   eol_mnemonic_unix = build_string (":");
5753
5754   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5755     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5756   eol_mnemonic_dos = build_string ("\\");
5757
5758   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5759     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5760   eol_mnemonic_mac = build_string ("/");
5761
5762   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5763     "*String displayed in mode line when end-of-line format is not yet determined.");
5764   eol_mnemonic_undecided = build_string (":");
5765
5766   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
5767     "*Non-nil enables character translation while encoding and decoding.");
5768   Venable_character_translation = Qt;
5769
5770   DEFVAR_LISP ("standard-translation-table-for-decode",
5771     &Vstandard_translation_table_for_decode,
5772     "Table for translating characters while decoding.");
5773   Vstandard_translation_table_for_decode = Qnil;
5774
5775   DEFVAR_LISP ("standard-translation-table-for-encode",
5776     &Vstandard_translation_table_for_encode,
5777     "Table for translationg characters while encoding.");
5778   Vstandard_translation_table_for_encode = Qnil;
5779
5780   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5781     "Alist of charsets vs revision numbers.\n\
5782 While encoding, if a charset (car part of an element) is found,\n\
5783 designate it with the escape sequence identifing revision (cdr part of the element).");
5784   Vcharset_revision_alist = Qnil;
5785
5786   DEFVAR_LISP ("default-process-coding-system",
5787                &Vdefault_process_coding_system,
5788     "Cons of coding systems used for process I/O by default.\n\
5789 The car part is used for decoding a process output,\n\
5790 the cdr part is used for encoding a text to be sent to a process.");
5791   Vdefault_process_coding_system = Qnil;
5792
5793   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5794     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5795 This is a vector of length 256.\n\
5796 If Nth element is non-nil, the existence of code N in a file\n\
5797 \(or output of subprocess) doesn't prevent it to be detected as\n\
5798 a coding system of ISO 2022 variant which has a flag\n\
5799 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5800 or reading output of a subprocess.\n\
5801 Only 128th through 159th elements has a meaning.");
5802   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
5803
5804   DEFVAR_LISP ("select-safe-coding-system-function",
5805                &Vselect_safe_coding_system_function,
5806     "Function to call to select safe coding system for encoding a text.\n\
5807 \n\
5808 If set, this function is called to force a user to select a proper\n\
5809 coding system which can encode the text in the case that a default\n\
5810 coding system used in each operation can't encode the text.\n\
5811 \n\
5812 The default value is `select-safe-coding-system' (which see).");
5813   Vselect_safe_coding_system_function = Qnil;
5814
5815 }
5816
5817 #endif /* emacs */