src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995, 1997, 1998, 2002 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
 170   DST_BYTES zero means that the source area and destination area are
 171   overlapped, which means that we can produce encoded text until it
 172   reaches the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
 188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 189    get one and two bytes from the source text respectively.
 190    If there are not enough bytes in the source, they jump to
 191    `label_end_of_loop'.  The caller should set variables `coding',
 192    `src', and `src_end' to appropriate pointers in advance.  These
 193    macros are called from decoding routines `decode_coding_XXX', thus
 194    it is assumed that the source text is unibyte.  */
 195
 196 #define ONE_MORE_BYTE(c1)                                       \
 197   do {                                                          \
 198     if (src >= src_end)                                         \
 199       {                                                         \
 200         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 201         goto label_end_of_loop;                                 \
 202       }                                                         \
 203     c1 = *src++;                                                \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src + 1 >= src_end)                                     \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = *src++;                                                \
 214     c2 = *src++;                                                \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 219    form if MULTIBYTEP is nonzero.  */
 220
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 230       c1 = *src++ - 0x20;                                       \
 231   } while (0)
 232
 233 /* Set C to the next character at the source text pointed by `src'.
 234    If there are not enough characters in the source, jump to
 235    `label_end_of_loop'.  The caller should set variables `coding'
 236    `src', `src_end', and `translation_table' to appropriate pointers
 237    in advance.  This macro is used in encoding routines
 238    `encode_coding_XXX', thus it assumes that the source text is in
 239    multibyte form except for 8-bit characters.  8-bit characters are
 240    in multibyte form if coding->src_multibyte is nonzero, else they
 241    are represented by a single byte.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;                                      \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  Jump to
 264    `label_end_of_loop' if there's not enough space at `dst'.
 265
 266    If we are now in the middle of a composition sequence, the decoded
 267    character may be ALTCHAR (for the current composition).  In that
 268    case, the character goes to coding->cmp_data->data instead of
 269    `dst'.
 270
 271    This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {                                                          \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {                                                          \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)                                           \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348
 349 #else  /* not emacs */
 350
 351 #include "mulelib.h"
 352
 353 #endif /* not emacs */
 354
 355 Lisp_Object Qcoding_system, Qeol_type;
 356 Lisp_Object Qbuffer_file_coding_system;
 357 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 358 Lisp_Object Qno_conversion, Qundecided;
 359 Lisp_Object Qcoding_system_history;
 360 Lisp_Object Qsafe_chars;
 361 Lisp_Object Qvalid_codes;
 362
 363 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 364 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 365 Lisp_Object Qstart_process, Qopen_network_stream;
 366 Lisp_Object Qtarget_idx;
 367
 368 Lisp_Object Vselect_safe_coding_system_function;
 369
 370 /* Mnemonic string for each format of end-of-line.  */
 371 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 372 /* Mnemonic string to indicate format of end-of-line is not yet
 373    decided.  */
 374 Lisp_Object eol_mnemonic_undecided;
 375
 376 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 377    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 378 int system_eol_type;
 379
 380 #ifdef emacs
 381
 382 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 383
 384 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 385
 386 /* Coding system emacs-mule and raw-text are for converting only
 387    end-of-line format.  */
 388 Lisp_Object Qemacs_mule, Qraw_text;
 389
 390 /* Coding-systems are handed between Emacs Lisp programs and C internal
 391    routines by the following three variables.  */
 392 /* Coding-system for reading files and receiving data from process.  */
 393 Lisp_Object Vcoding_system_for_read;
 394 /* Coding-system for writing files and sending data to process.  */
 395 Lisp_Object Vcoding_system_for_write;
 396 /* Coding-system actually used in the latest I/O.  */
 397 Lisp_Object Vlast_coding_system_used;
 398
 399 /* A vector of length 256 which contains information about special
 400    Latin codes (especially for dealing with Microsoft codes).  */
 401 Lisp_Object Vlatin_extra_code_table;
 402
 403 /* Flag to inhibit code conversion of end-of-line format.  */
 404 int inhibit_eol_conversion;
 405
 406 /* Flag to inhibit ISO2022 escape sequence detection.  */
 407 int inhibit_iso_escape_detection;
 408
 409 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 410 int inherit_process_coding_system;
 411
 412 /* Coding system to be used to encode text for terminal display.  */
 413 struct coding_system terminal_coding;
 414
 415 /* Coding system to be used to encode text for terminal display when
 416    terminal coding system is nil.  */
 417 struct coding_system safe_terminal_coding;
 418
 419 /* Coding system of what is sent from terminal keyboard.  */
 420 struct coding_system keyboard_coding;
 421
 422 /* Default coding system to be used to write a file.  */
 423 struct coding_system default_buffer_file_coding;
 424
 425 Lisp_Object Vfile_coding_system_alist;
 426 Lisp_Object Vprocess_coding_system_alist;
 427 Lisp_Object Vnetwork_coding_system_alist;
 428
 429 Lisp_Object Vlocale_coding_system;
 430
 431 #endif /* emacs */
 432
 433 Lisp_Object Qcoding_category, Qcoding_category_index;
 434
 435 /* List of symbols `coding-category-xxx' ordered by priority.  */
 436 Lisp_Object Vcoding_category_list;
 437
 438 /* Table of coding categories (Lisp symbols).  */
 439 Lisp_Object Vcoding_category_table;
 440
 441 /* Table of symbol names, one for each coding category.  */
 442 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 443   "coding-category-emacs-mule",
 444   "coding-category-sjis",
 445   "coding-category-iso-7",
 446   "coding-category-iso-7-tight",
 447   "coding-category-iso-8-1",
 448   "coding-category-iso-8-2",
 449   "coding-category-iso-7-else",
 450   "coding-category-iso-8-else",
 451   "coding-category-ccl",
 452   "coding-category-big5",
 453   "coding-category-utf-8",
 454   "coding-category-utf-16-be",
 455   "coding-category-utf-16-le",
 456   "coding-category-raw-text",
 457   "coding-category-binary"
 458 };
 459
 460 /* Table of pointers to the coding systems corresponding to each
 461    coding category.  */
 462 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 463
 464 /* Table of coding category masks.  Nth element is a mask for a coding
 465    category of which priority is Nth.  */
 466 static
 467 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 468
 469 /* Flag to tell if we look up translation table on character code
 470    conversion.  */
 471 Lisp_Object Venable_character_translation;
 472 /* Standard translation table to look up on decoding (reading).  */
 473 Lisp_Object Vstandard_translation_table_for_decode;
 474 /* Standard translation table to look up on encoding (writing).  */
 475 Lisp_Object Vstandard_translation_table_for_encode;
 476
 477 Lisp_Object Qtranslation_table;
 478 Lisp_Object Qtranslation_table_id;
 479 Lisp_Object Qtranslation_table_for_decode;
 480 Lisp_Object Qtranslation_table_for_encode;
 481
 482 /* Alist of charsets vs revision number.  */
 483 Lisp_Object Vcharset_revision_alist;
 484
 485 /* Default coding systems used for process I/O.  */
 486 Lisp_Object Vdefault_process_coding_system;
 487
 488 /* Global flag to tell that we can't call post-read-conversion and
 489    pre-write-conversion functions.  Usually the value is zero, but it
 490    is set to 1 temporarily while such functions are running.  This is
 491    to avoid infinite recursive call.  */
 492 static int inhibit_pre_post_conversion;
 493
 494 /* Char-table containing safe coding systems of each character.  */
 495 Lisp_Object Vchar_coding_system_table;
 496 Lisp_Object Qchar_coding_system;
 497
 498 /* Return `safe-chars' property of coding system CODING.  Don't check
 499    validity of CODING.  */
 500
 501 Lisp_Object
 502 coding_safe_chars (coding)
 503      struct coding_system *coding;
 504 {
 505   Lisp_Object coding_spec, plist, safe_chars;
 506
 507   coding_spec = Fget (coding->symbol, Qcoding_system);
 508   plist = XVECTOR (coding_spec)->contents[3];
 509   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 510   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 511 }
 512
 513 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 514   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 515
 516 \f
 517 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 518
 519 /* Emacs' internal format for representation of multiple character
 520    sets is a kind of multi-byte encoding, i.e. characters are
 521    represented by variable-length sequences of one-byte codes.
 522
 523    ASCII characters and control characters (e.g. `tab', `newline') are
 524    represented by one-byte sequences which are their ASCII codes, in
 525    the range 0x00 through 0x7F.
 526
 527    8-bit characters of the range 0x80..0x9F are represented by
 528    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 529    code + 0x20).
 530
 531    8-bit characters of the range 0xA0..0xFF are represented by
 532    one-byte sequences which are their 8-bit code.
 533
 534    The other characters are represented by a sequence of `base
 535    leading-code', optional `extended leading-code', and one or two
 536    `position-code's.  The length of the sequence is determined by the
 537    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 538    whereas extended leading-code and position-code take the range 0xA0
 539    through 0xFF.  See `charset.h' for more details about leading-code
 540    and position-code.
 541
 542    --- CODE RANGE of Emacs' internal format ---
 543    character set        range
 544    -------------        -----
 545    ascii                0x00..0x7F
 546    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 547    eight-bit-graphic    0xA0..0xFF
 548    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 549    ---------------------------------------------
 550
 551    As this is the internal character representation, the format is
 552    usually not used externally (i.e. in a file or in a data sent to a
 553    process).  But, it is possible to have a text externally in this
 554    format (i.e. by encoding by the coding system `emacs-mule').
 555
 556    In that case, a sequence of one-byte codes has a slightly different
 557    form.
 558
 559    Firstly, all characters in eight-bit-control are represented by
 560    one-byte sequences which are their 8-bit code.
 561
 562    Next, character composition data are represented by the byte
 563    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 564    where,
 565         METHOD is 0xF0 plus one of composition method (enum
 566         composition_method),
 567
 568         BYTES is 0xA0 plus the byte length of these composition data,
 569
 570         CHARS is 0xA0 plus the number of characters composed by these
 571         data,
 572
 573         COMPONENTs are characters of multibyte form or composition
 574         rules encoded by two-byte of ASCII codes.
 575
 576    In addition, for backward compatibility, the following formats are
 577    also recognized as composition data on decoding.
 578
 579    0x80 MSEQ ...
 580    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 581
 582    Here,
 583         MSEQ is a multibyte form, but in one of these special formats:
 584           ASCII: 0xA0 ASCII_CODE+0x80,
 585           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 586         RULE is a one byte code of the range 0xA0..0xF0 that
 587         represents a composition rule.
 588   */
 589
 590 enum emacs_code_class_type emacs_code_class[256];
 591
 592 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 593    Check if a text is encoded in Emacs' internal format.  If it is,
 594    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 595
 596 static int
 597 detect_coding_emacs_mule (src, src_end, multibytep)
 598       unsigned char *src, *src_end;
 599       int multibytep;
 600 {
 601   unsigned char c;
 602   int composing = 0;
 603   /* Dummy for ONE_MORE_BYTE.  */
 604   struct coding_system dummy_coding;
 605   struct coding_system *coding = &dummy_coding;
 606
 607   while (1)
 608     {
 609       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 610
 611       if (composing)
 612         {
 613           if (c < 0xA0)
 614             composing = 0;
 615           else if (c == 0xA0)
 616             {
 617               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 618               c &= 0x7F;
 619             }
 620           else
 621             c -= 0x20;
 622         }
 623
 624       if (c < 0x20)
 625         {
 626           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 627             return 0;
 628         }
 629       else if (c >= 0x80 && c < 0xA0)
 630         {
 631           if (c == 0x80)
 632             /* Old leading code for a composite character.  */
 633             composing = 1;
 634           else
 635             {
 636               unsigned char *src_base = src - 1;
 637               int bytes;
 638
 639               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 640                                                bytes))
 641                 return 0;
 642               src = src_base + bytes;
 643             }
 644         }
 645     }
 646  label_end_of_loop:
 647   return CODING_CATEGORY_MASK_EMACS_MULE;
 648 }
 649
 650
 651 /* Record the starting position START and METHOD of one composition.  */
 652
 653 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 654   do {                                                          \
 655     struct composition_data *cmp_data = coding->cmp_data;       \
 656     int *data = cmp_data->data + cmp_data->used;                \
 657     coding->cmp_data_start = cmp_data->used;                    \
 658     data[0] = -1;                                               \
 659     data[1] = cmp_data->char_offset + start;                    \
 660     data[3] = (int) method;                                     \
 661     cmp_data->used += 4;                                        \
 662   } while (0)
 663
 664 /* Record the ending position END of the current composition.  */
 665
 666 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 667   do {                                                          \
 668     struct composition_data *cmp_data = coding->cmp_data;       \
 669     int *data = cmp_data->data + coding->cmp_data_start;        \
 670     data[0] = cmp_data->used - coding->cmp_data_start;          \
 671     data[2] = cmp_data->char_offset + end;                      \
 672   } while (0)
 673
 674 /* Record one COMPONENT (alternate character or composition rule).  */
 675
 676 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
 677   (coding->cmp_data->data[coding->cmp_data->used++] = component)
 678
 679
 680 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 681    is not less than SRC_END, return -1 without incrementing Src.  */
 682
 683 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 684
 685
 686 /* Decode a character represented as a component of composition
 687    sequence of Emacs 20 style at SRC.  Set C to that character, store
 688    its multibyte form sequence at P, and set P to the end of that
 689    sequence.  If no valid character is found, set C to -1.  */
 690
 691 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 692   do {  /* Set C to the next component char, or negative if none.  */ \
 693     int bytes;                                                  \
 694                                                                 \
 695     c = SAFE_ONE_MORE_BYTE ();                                  \
 696     if (c < 0)          /* No byte available; leave C negative.  */ \
 697       break;                                                    \
 698     if (CHAR_HEAD_P (c)) /* A char-head byte is not a component.  */ \
 699       c = -1;                                                   \
 700     else if (c == 0xA0) /* ASCII component: 0xA0, ASCII_CODE+0x80.  */ \
 701       {                                                         \
 702         c = SAFE_ONE_MORE_BYTE ();                              \
 703         if (c < 0xA0)   /* Also rejects a negative (missing) byte.  */ \
 704           c = -1;                                               \
 705         else                                                    \
 706           {                                                     \
 707             c -= 0x80;  /* 0xA0..0xFF -> ASCII 0x20..0x7F.  */  \
 708             *p++ = c;   /* Store the character at P, advance P.  */ \
 709           }                                                     \
 710       }                                                         \
 711     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 712       {                 /* Leading code was stored as CODE+0x20.  */ \
 713         unsigned char *p0 = p;                                  \
 714                                                                 \
 715         c -= 0x20;                                              \
 716         *p++ = c;                                               \
 717         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 718         while (--bytes) /* Copy the bytes following the head.  */ \
 719           {                                                     \
 720             c = SAFE_ONE_MORE_BYTE ();                          \
 721             if (c < 0)                                          \
 722               break;                                            \
 723             *p++ = c;                                           \
 724           }                                                     \
 725         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))     \
 726           c = STRING_CHAR (p0, bytes); /* Valid multibyte form.  */ \
 727         else                                                    \
 728           c = -1;                                               \
 729       }                                                         \
 730     else                                                        \
 731       c = -1;                                                   \
 732   } while (0)
 733
 734
 735 /* Decode a composition rule represented as a component of composition
 736    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 737    valid rule is found, set C to -1.  */
 738
 739 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 740   do {                                                  \
 741     c = SAFE_ONE_MORE_BYTE ();                          \
 742     c -= 0xA0;          /* Rule byte is biased by 0xA0.  */ \
 743     if (c >= 0 && c < 81)                               \
 744       {                 /* 81 == 9 * 9 (GREF, NREF) pairs.  */ \
 745         gref = c / 9, nref = c % 9;                     \
 746         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 747       }                                                 \
 748     else                                                \
 749       c = -1;           /* Out of range or no byte read.  */ \
 750   } while (0)
 751
 752
 753 /* Decode composition sequence encoded by `emacs-mule' at the source
 754    pointed by SRC.  SRC_END is the end of source.  Store information
 755    of the composition in CODING->cmp_data.
 756
 757    For backward compatibility, decode also a composition sequence of
 758    Emacs 20 style.  In that case, the composition sequence contains
 759    characters that should be extracted into a buffer or string.  Store
 760    those characters at *DESTINATION in multibyte form.
 761
 762    If we encounter an invalid byte sequence, return 0.
 763    If we encounter an insufficient source or destination, or
 764    insufficient space in CODING->cmp_data, return -1.
 765    Otherwise, return consumed bytes in the source.
 766    The destination limit is DST_END if DST_BYTES is nonzero, else SRC.
 767 */
 768 static INLINE int
 769 decode_composition_emacs_mule (coding, src, src_end,
 770                                destination, dst_end, dst_bytes)
 771      struct coding_system *coding;
 772      unsigned char *src, *src_end, **destination, *dst_end;
 773      int dst_bytes;
 774 {
 775   unsigned char *dst = *destination;
 776   int method, data_len, nchars;
 777   unsigned char *src_base = src++;      /* SRC_BASE keeps the first byte's position.  */
 778   /* Store components of composition.  */
 779   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 780   int ncomponent;
 781   /* Store multibyte form of characters to be composed.  This is for
 782      Emacs 20 style composition sequence.  */
 783   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 784   unsigned char *bufp = buf;
 785   int c, i, gref, nref;
 786
 787   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 788       >= COMPOSITION_DATA_SIZE)
 789     {
 790       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 791       return -1;
 792     }
 793
 794   ONE_MORE_BYTE (c);
 795   if (c - 0xF0 >= COMPOSITION_RELATIVE
 796            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 797     {
 798       int with_rule;
 799
 800       method = c - 0xF0;                /* Second byte is 0xF0 + METHOD.  */
 801       with_rule = (method == COMPOSITION_WITH_RULE
 802                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 803       ONE_MORE_BYTE (c);
 804       data_len = c - 0xA0;              /* Byte length of the whole sequence.  */
 805       if (data_len < 4
 806           || src_base + data_len > src_end)
 807         return 0;
 808       ONE_MORE_BYTE (c);
 809       nchars = c - 0xA0;                /* Number of composed characters.  */
 810       if (nchars < 1)                   /* Test the decoded count, not the raw byte.  */
 811         return 0;
 812       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 813         {
 814           /* If it is longer than this, it can't be valid.  */
 815           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 816             return 0;
 817
 818           if (ncomponent % 2 && with_rule)  /* Odd slots hold rules.  */
 819             {
 820               ONE_MORE_BYTE (gref);
 821               gref -= 32;
 822               ONE_MORE_BYTE (nref);
 823               nref -= 32;
 824               c = COMPOSITION_ENCODE_RULE (gref, nref);
 825             }
 826           else
 827             {
 828               int bytes;
 829               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 830                 c = STRING_CHAR (src, bytes);
 831               else
 832                 c = *src, bytes = 1;    /* Not a valid multibyte form: take one raw byte.  */
 833               src += bytes;
 834             }
 835           component[ncomponent] = c;
 836         }
 837     }
 838   else
 839     {
 840       /* This may be an old Emacs 20 style format.  See the comment at
 841          the section 2 of this file.  */
 842       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 843       if (src == src_end
 844           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 845         goto label_end_of_loop;         /* The sequence may continue in the next block.  */
 846
 847       src_end = src;                    /* Limit decoding to the sequence just scanned.  */
 848       src = src_base + 1;
 849       if (c < 0xC0)
 850         {
 851           method = COMPOSITION_RELATIVE;
 852           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 853             {
 854               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 855               if (c < 0)
 856                 break;
 857               component[ncomponent++] = c;
 858             }
 859           if (ncomponent < 2)
 860             return 0;
 861           nchars = ncomponent;
 862         }
 863       else if (c == 0xFF)
 864         {
 865           method = COMPOSITION_WITH_RULE;
 866           src++;                        /* Skip the 0xFF marker.  */
 867           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 868           if (c < 0)
 869             return 0;
 870           component[0] = c;
 871           for (ncomponent = 1;
 872                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 873             {
 874               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 875               if (c < 0)
 876                 break;
 877               component[ncomponent++] = c;
 878               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 879               if (c < 0)
 880                 break;
 881               component[ncomponent++] = c;
 882             }
 883           if (ncomponent < 3)
 884             return 0;
 885           nchars = (ncomponent + 1) / 2;  /* Chars and rules alternate.  */
 886         }
 887       else
 888         return 0;
 889     }
 890
 891   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 892     {
 893       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 894       for (i = 0; i < ncomponent; i++)
 895         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 896       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 897       if (buf < bufp)                   /* Emacs 20 style: emit the extracted characters.  */
 898         {
 899           unsigned char *p = buf;
 900           EMIT_BYTES (p, bufp);
 901           *destination += bufp - buf;
 902           coding->produced_char += nchars;
 903         }
 904       return (src - src_base);
 905     }
 906  label_end_of_loop:
 907   return -1;
 908 }
 909
 910 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
 911    This decoder converts end-of-lines, handles composition data, and copies the remaining bytes.  */
 912 static void
 913 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 914      struct coding_system *coding;
 915      unsigned char *source, *destination;
 916      int src_bytes, dst_bytes;
 917 {
 918   unsigned char *src = source;
 919   unsigned char *src_end = source + src_bytes;
 920   unsigned char *dst = destination;
 921   unsigned char *dst_end = destination + dst_bytes;
 922   /* SRC_BASE remembers the start position in source in each loop.
 923      The loop will be exited when there's not enough source code, or
 924      when there's not enough destination area to produce a
 925      character.  */
 926   unsigned char *src_base;
 927
 928   coding->produced_char = 0;
 929   while ((src_base = src) < src_end)
 930     {
 931       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 932       int bytes;
 933
 934       if (*src == '\r')
 935         {
 936           int c = *src++;
 937
 938           if (coding->eol_type == CODING_EOL_CR)   /* A lone CR ends the line.  */
 939             c = '\n';
 940           else if (coding->eol_type == CODING_EOL_CRLF)
 941             {
 942               ONE_MORE_BYTE (c);    /* NOTE(review): defined outside this view; presumably jumps to label_end_of_loop when the source runs out -- confirm.  */
 943               if (c != '\n')
 944                 {
 945                   src--;            /* Not CRLF: un-read the byte and keep the CR.  */
 946                   c = '\r';
 947                 }
 948             }
 949           *dst++ = c;
 950           coding->produced_char++;
 951           continue;
 952         }
 953       else if (*src == '\n')
 954         {
 955           if ((coding->eol_type == CODING_EOL_CR
 956                || coding->eol_type == CODING_EOL_CRLF)
 957               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 958             {
 959               coding->result = CODING_FINISH_INCONSISTENT_EOL;   /* Bare LF under a CR or CRLF convention.  */
 960               goto label_end_of_loop;
 961             }
 962           *dst++ = *src++;
 963           coding->produced_char++;
 964           continue;
 965         }
 966       else if (*src == 0x80 && coding->cmp_data)
 967         {
 968           /* Start of composition data.  */
 969           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
 970                                                          &dst, dst_end,
 971                                                          dst_bytes);
 972           if (consumed < 0)         /* Negative: stop decoding here.  */
 973             goto label_end_of_loop;
 974           else if (consumed > 0)    /* Positive: that many source bytes were handled.  */
 975             {
 976               src += consumed;
 977               continue;
 978             }
 979           bytes = CHAR_STRING (*src, tmp);  /* Zero: emit the 0x80 byte itself through TMP.  */
 980           p = tmp;
 981           src++;
 982         }
 983       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 984         {
 985           p = src;                  /* Copy the BYTES-long sequence unchanged.  */
 986           src += bytes;
 987         }
 988       else
 989         {
 990           bytes = CHAR_STRING (*src, tmp);  /* Otherwise convert the single byte via CHAR_STRING.  */
 991           p = tmp;
 992           src++;
 993         }
 994       if (dst + bytes >= (dst_bytes ? dst_end : src))   /* With DST_BYTES zero the limit is SRC (presumably in-place decoding -- confirm).  */
 995         {
 996           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 997           break;
 998         }
 999       while (bytes--) *dst++ = *p++;
 1000       coding->produced_char++;
 1001     }
 1002  label_end_of_loop:
 1003   coding->consumed = coding->consumed_char = src_base - source;   /* Counts only up to the start of the last iteration.  */
 1004   coding->produced = dst - destination;
 1005 }
1006
1007
 1008 /* Encode composition data stored at DATA into a special byte sequence
 1009    starting with 0x80.  Update CODING->cmp_data_start and maybe
 1010    CODING->cmp_data for the next call.  */
 1011
 1012 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
 1013   do {  /* Uses dst, dst_end, dst_bytes, src and label_end_of_loop of the caller.  */ \
 1014     unsigned char buf[1024], *p0 = buf, *p;                             \
 1015     int len = data[0];          /* Length of this entry in DATA.  */    \
 1016     int i;                                                              \
 1017                                                                         \
 1018     buf[0] = 0x80;                                                      \
 1019     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
 1020     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
 1021     p = buf + 4;                /* Components follow the 4-byte header.  */ \
 1022     if (data[3] == COMPOSITION_WITH_RULE                                \
 1023         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
 1024       {                                                                 \
 1025         p += CHAR_STRING (data[4], p);                                  \
 1026         for (i = 5; i < len; i += 2) /* Then (rule, char) pairs.  */    \
 1027           {                                                             \
 1028             int gref, nref;                                             \
 1029              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
 1030             *p++ = 0x20 + gref; /* Rule is two bytes biased by 0x20.  */ \
 1031             *p++ = 0x20 + nref;                                         \
 1032             p += CHAR_STRING (data[i + 1], p);                          \
 1033           }                                                             \
 1034       }                                                                 \
 1035     else                                                                \
 1036       {                                                                 \
 1037         for (i = 4; i < len; i++)                                       \
 1038           p += CHAR_STRING (data[i], p);                                \
 1039       }                                                                 \
 1040     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
 1041                                                                         \
 1042     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
 1043       {                         /* Not enough room: stop the caller's loop.  */ \
 1044         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
 1045         goto label_end_of_loop;                                         \
 1046       }                                                                 \
 1047     while (p0 < p)              /* Copy the sequence to DST.  */        \
 1048       *dst++ = *p0++;                                                   \
 1049     coding->cmp_data_start += data[0];                                  \
 1050     if (coding->cmp_data_start == coding->cmp_data->used                \
 1051         && coding->cmp_data->next)                                      \
 1052       {                         /* This bunch is used up; go to the next one.  */ \
 1053         coding->cmp_data = coding->cmp_data->next;                      \
 1054         coding->cmp_data_start = 0;                                     \
 1055       }                                                                 \
 1056   } while (0)
1057
1058
 1059 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
 1060                             unsigned char *, int, int));
 1061 /* Encode to emacs-mule: emit composition data ahead of each composition, and convert '\n' per CODING->eol_type.  */
 1062 static void
 1063 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 1064      struct coding_system *coding;
 1065      unsigned char *source, *destination;
 1066      int src_bytes, dst_bytes;
 1067 {
 1068   unsigned char *src = source;
 1069   unsigned char *src_end = source + src_bytes;
 1070   unsigned char *dst = destination;
 1071   unsigned char *dst_end = destination + dst_bytes;
 1072   unsigned char *src_base;
 1073   int c;
 1074   int char_offset;
 1075   int *data;
 1076
 1077   Lisp_Object translation_table;    /* NOTE(review): not used directly below; presumably referenced by ONE_MORE_CHAR -- confirm.  */
 1078
 1079   translation_table = Qnil;
 1080
 1081   /* Optimization for the case that there's no composition.  */
 1082   if (!coding->cmp_data || coding->cmp_data->used == 0)
 1083     {
 1084       encode_eol (coding, source, destination, src_bytes, dst_bytes);
 1085       return;
 1086     }
 1087
 1088   char_offset = coding->cmp_data->char_offset;
 1089   data = coding->cmp_data->data + coding->cmp_data_start;
 1090   while (1)     /* No visible exit: the composition macro jumps to label_end_of_loop; ONE_MORE_CHAR / EMIT_* presumably do too -- confirm.  */
 1091     {
 1092       src_base = src;
 1093
 1094       /* If SRC starts a composition, encode the information about the
 1095          composition in advance.  */
 1096       if (coding->cmp_data_start < coding->cmp_data->used
 1097           && char_offset + coding->consumed_char == data[1])
 1098         {
 1099           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
 1100           char_offset = coding->cmp_data->char_offset;     /* Reload: the macro may have switched CODING->cmp_data.  */
 1101           data = coding->cmp_data->data + coding->cmp_data_start;
 1102         }
 1103
 1104       ONE_MORE_CHAR (c);
 1105       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
 1106                         || coding->eol_type == CODING_EOL_CR))
 1107         {
 1108           if (coding->eol_type == CODING_EOL_CRLF)
 1109             EMIT_TWO_BYTES ('\r', c);       /* LF becomes CR LF.  */
 1110           else
 1111             EMIT_ONE_BYTE ('\r');           /* LF becomes CR.  */
 1112         }
 1113       else if (SINGLE_BYTE_CHAR_P (c))
 1114         EMIT_ONE_BYTE (c);
 1115       else
 1116         EMIT_BYTES (src_base, src);         /* Copy the source bytes from SRC_BASE up to SRC.  */
 1117       coding->consumed_char++;
 1118     }
 1119  label_end_of_loop:
 1120   coding->consumed = src_base - source;     /* Bytes consumed up to the start of the last iteration.  */
 1121   coding->produced = coding->produced_char = dst - destination;
 1122   return;
 1123 }
1124
1125 \f
1126 /*** 3. ISO2022 handlers ***/
1127
1128 /* The following note describes the coding system ISO2022 briefly.
1129    Since the intention of this note is to help understand the
1130    functions in this file, some parts are NOT ACCURATE or are OVERLY
1131    SIMPLIFIED.  For thorough understanding, please refer to the
1132    original document of ISO2022.  This is equivalent to the standard
1133    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1134
1135    ISO2022 provides many mechanisms to encode several character sets
1136    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1137    is encoded using bytes less than 128.  This may make the encoded
1138    text a little bit longer, but the text passes more easily through
1139    several types of gateway, some of which strip off the MSB (Most
1140    Significant Bit).
1141
1142    There are two kinds of character sets: control character sets and
1143    graphic character sets.  The former contain control characters such
1144    as `newline' and `escape' to provide control functions (control
1145    functions are also provided by escape sequences).  The latter
1146    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1147    two control character sets and many graphic character sets.
1148
1149    Graphic character sets are classified into one of the following
1150    four classes, according to the number of bytes (DIMENSION) and
1151    number of characters in one dimension (CHARS) of the set:
1152    - DIMENSION1_CHARS94
1153    - DIMENSION1_CHARS96
1154    - DIMENSION2_CHARS94
1155    - DIMENSION2_CHARS96
1156
1157    In addition, each character set is assigned an identification tag,
1158    unique for each set, called the "final character" (denoted as <F>
1159    hereafter).  The <F> of each character set is decided by ECMA(*)
1160    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1161    (0x30..0x3F are for private use only).
1162
1163    Note (*): ECMA = European Computer Manufacturers Association
1164
1165    Here are examples of graphic character sets [NAME(<F>)]:
1166         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1167         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1168         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1169         o DIMENSION2_CHARS96 -- none for the moment
1170
1171    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1172         C0 [0x00..0x1F] -- control character plane 0
1173         GL [0x20..0x7F] -- graphic character plane 0
1174         C1 [0x80..0x9F] -- control character plane 1
1175         GR [0xA0..0xFF] -- graphic character plane 1
1176
1177    A control character set is directly designated and invoked to C0 or
1178    C1 by an escape sequence.  The most common case is that:
1179    - ISO646's  control character set is designated/invoked to C0, and
1180    - ISO6429's control character set is designated/invoked to C1,
1181    and usually these designations/invocations are omitted in encoded
1182    text.  In a 7-bit environment, only C0 can be used, and a control
1183    character for C1 is encoded by an appropriate escape sequence to
1184    fit into the environment.  All control characters for C1 are
1185    defined to have corresponding escape sequences.
1186
1187    A graphic character set is at first designated to one of four
1188    graphic registers (G0 through G3), then these graphic registers are
1189    invoked to GL or GR.  These designations and invocations can be
1190    done independently.  The most common case is that G0 is invoked to
1191    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1192    these invocations and designations are omitted in encoded text.
1193    In a 7-bit environment, only GL can be used.
1194
1195    When a graphic character set of CHARS94 is invoked to GL, codes
1196    0x20 and 0x7F of the GL area work as control characters SPACE and
1197    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1198    be used.
1199
1200    There are two ways of invocation: locking-shift and single-shift.
1201    With locking-shift, the invocation lasts until the next different
1202    invocation, whereas with single-shift, the invocation affects the
1203    following character only and doesn't affect the locking-shift
1204    state.  Invocations are done by the following control characters or
1205    escape sequences:
1206
1207    ----------------------------------------------------------------------
1208    abbrev  function                  cntrl escape seq   description
1209    ----------------------------------------------------------------------
1210    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1211    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1212    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1213    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1214    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1215    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1216    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1217    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1218    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1219    ----------------------------------------------------------------------
1220    (*) These are not used by any known coding system.
1221
1222    Control characters for these functions are defined by macros
1223    ISO_CODE_XXX in `coding.h'.
1224
1225    Designations are done by the following escape sequences:
1226    ----------------------------------------------------------------------
1227    escape sequence      description
1228    ----------------------------------------------------------------------
1229    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1230    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1231    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1232    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1233    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1234    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1235    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1236    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1237    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1238    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1239    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1240    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1241    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1242    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1243    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1244    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1245    ----------------------------------------------------------------------
1246
1247    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1248    of dimension 1, chars 94, and final character <F>, etc...
1249
1250    Note (*): Although these designations are not allowed in ISO2022,
1251    Emacs accepts them on decoding, and produces them on encoding
1252    CHARS96 character sets in a coding system which is characterized as
1253    7-bit environment, non-locking-shift, and non-single-shift.
1254
1255    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1256    '(' can be omitted.  We refer to this as "short-form" hereafter.
1257
1258    Now you may notice that there are a lot of ways of encoding the
1259    same multilingual text in ISO2022.  Actually, there exist many
 1260    coding systems such as Compound Text (used in X11's inter client
 1261    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1262    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1263    localized platforms), and all of these are variants of ISO2022.
1264
1265    In addition to the above, Emacs handles two more kinds of escape
1266    sequences: ISO6429's direction specification and Emacs' private
1267    sequence for specifying character composition.
1268
1269    ISO6429's direction specification takes the following form:
1270         o CSI ']'      -- end of the current direction
1271         o CSI '0' ']'  -- end of the current direction
1272         o CSI '1' ']'  -- start of left-to-right text
1273         o CSI '2' ']'  -- start of right-to-left text
1274    The control character CSI (0x9B: control sequence introducer) is
1275    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1276
1277    Character composition specification takes the following form:
1278         o ESC '0' -- start relative composition
1279         o ESC '1' -- end composition
1280         o ESC '2' -- start rule-base composition (*)
1281         o ESC '3' -- start relative composition with alternate chars  (**)
1282         o ESC '4' -- start rule-base composition with alternate chars  (**)
1283   Since these are not standard escape sequences of any ISO standard,
1284   the use of them with these meanings is restricted to Emacs only.
1285
1286   (*) This form is used only in Emacs 20.5 and older versions,
1287   but the newer versions can safely decode it.
1288   (**) This form is used only in Emacs 21.1 and newer versions,
1289   and the older versions can't decode it.
1290
1291   Here's a list of example usages of these composition escape
1292   sequences (categorized by `enum composition_method').
1293
1294   COMPOSITION_RELATIVE:
1295         ESC 0 CHAR [ CHAR ] ESC 1
1296   COMPOSITION_WITH_RULE:
1297         ESC 2 CHAR [ RULE CHAR ] ESC 1
1298   COMPOSITION_WITH_ALTCHARS:
1299         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1300   COMPOSITION_WITH_RULE_ALTCHARS:
1301         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1302
 1303 enum iso_code_class_type iso_code_class[256];     /* 256 entries; presumably indexed by byte value -- confirm.  */
 1304 /* Nonzero if coding_system_table[IDX] exists, CHARSET is ASCII or C is one of its safe chars, and CHARSET has a requested designation.  Sets the caller's `safe_chars'.  */
 1305 #define CHARSET_OK(idx, charset, c)                                     \
 1306   (coding_system_table[idx]                                             \
 1307    && (charset == CHARSET_ASCII                                         \
 1308        || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
 1309            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
 1310    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
 1311                                               charset)                  \
 1312        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
 1313 /* Nonzero if coding_system_table[IDX] has a non-negative initial designation at index 1 (presumably graphic register G1 -- confirm).  */
 1314 #define SHIFT_OUT_OK(idx) \
 1315   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1316
1317 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1318    Check if a text is encoded in ISO2022.  If it is, return an
1319    integer in which appropriate flag bits any of:
1320         CODING_CATEGORY_MASK_ISO_7
1321         CODING_CATEGORY_MASK_ISO_7_TIGHT
1322         CODING_CATEGORY_MASK_ISO_8_1
1323         CODING_CATEGORY_MASK_ISO_8_2
1324         CODING_CATEGORY_MASK_ISO_7_ELSE
1325         CODING_CATEGORY_MASK_ISO_8_ELSE
1326    are set.  If a code which should never appear in ISO2022 is found,
1327    returns 0.  */
1328
1329 static int
1330 detect_coding_iso2022 (src, src_end, multibytep)
1331      unsigned char *src, *src_end;
1332      int multibytep;
1333 {
1334   int mask = CODING_CATEGORY_MASK_ISO;
1335   int mask_found = 0;
1336   int reg[4], shift_out = 0, single_shifting = 0;
1337   int c, c1, charset;
1338   /* Dummy for ONE_MORE_BYTE.  */
1339   struct coding_system dummy_coding;
1340   struct coding_system *coding = &dummy_coding;
1341   Lisp_Object safe_chars;
1342
1343   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1344   while (mask && src < src_end)
1345     {
1346       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1347     retry:
1348       switch (c)
1349         {
1350         case ISO_CODE_ESC:
1351           if (inhibit_iso_escape_detection)
1352             break;
1353           single_shifting = 0;
1354           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1355           if (c >= '(' && c <= '/')
1356             {
1357               /* Designation sequence for a charset of dimension 1.  */
1358               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1359               if (c1 < ' ' || c1 >= 0x80
1360                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1361                 /* Invalid designation sequence.  Just ignore.  */
1362                 break;
1363               reg[(c - '(') % 4] = charset;
1364             }
1365           else if (c == '$')
1366             {
1367               /* Designation sequence for a charset of dimension 2.  */
1368               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1369               if (c >= '@' && c <= 'B')
1370                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1371                 reg[0] = charset = iso_charset_table[1][0][c];
1372               else if (c >= '(' && c <= '/')
1373                 {
1374                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1375                   if (c1 < ' ' || c1 >= 0x80
1376                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1377                     /* Invalid designation sequence.  Just ignore.  */
1378                     break;
1379                   reg[(c - '(') % 4] = charset;
1380                 }
1381               else
1382                 /* Invalid designation sequence.  Just ignore.  */
1383                 break;
1384             }
1385           else if (c == 'N' || c == 'O')
1386             {
1387               /* ESC <Fe> for SS2 or SS3.  */
1388               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1389               break;
1390             }
1391           else if (c >= '0' && c <= '4')
1392             {
1393               /* ESC <Fp> for start/end composition.  */
1394               mask_found |= CODING_CATEGORY_MASK_ISO;
1395               break;
1396             }
1397           else
1398             /* Invalid escape sequence.  Just ignore.  */
1399             break;
1400
1401           /* We found a valid designation sequence for CHARSET.  */
1402           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1403           c = MAKE_CHAR (charset, 0, 0);
1404           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1405             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1406           else
1407             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1408           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1409             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1410           else
1411             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1412           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1413             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1414           else
1415             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1416           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1417             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1418           else
1419             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1420           break;
1421
1422         case ISO_CODE_SO:
1423           if (inhibit_iso_escape_detection)
1424             break;
1425           single_shifting = 0;
1426           if (shift_out == 0
1427               && (reg[1] >= 0
1428                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1429                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1430             {
1431               /* Locking shift out.  */
1432               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1433               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1434             }
1435           break;
1436
1437         case ISO_CODE_SI:
1438           if (inhibit_iso_escape_detection)
1439             break;
1440           single_shifting = 0;
1441           if (shift_out == 1)
1442             {
1443               /* Locking shift in.  */
1444               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1445               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1446             }
1447           break;
1448
1449         case ISO_CODE_CSI:
1450           single_shifting = 0;
1451         case ISO_CODE_SS2:
1452         case ISO_CODE_SS3:
1453           {
1454             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1455
1456             if (inhibit_iso_escape_detection)
1457               break;
1458             if (c != ISO_CODE_CSI)
1459               {
1460                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1461                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1462                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1463                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1464                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1465                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1466                 single_shifting = 1;
1467               }
1468             if (VECTORP (Vlatin_extra_code_table)
1469                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1470               {
1471                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1472                     & CODING_FLAG_ISO_LATIN_EXTRA)
1473                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1474                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1475                     & CODING_FLAG_ISO_LATIN_EXTRA)
1476                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1477               }
1478             mask &= newmask;
1479             mask_found |= newmask;
1480           }
1481           break;
1482
1483         default:
1484           if (c < 0x80)
1485             {
1486               single_shifting = 0;
1487               break;
1488             }
1489           else if (c < 0xA0)
1490             {
1491               single_shifting = 0;
1492               if (VECTORP (Vlatin_extra_code_table)
1493                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1494                 {
1495                   int newmask = 0;
1496
1497                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1498                       & CODING_FLAG_ISO_LATIN_EXTRA)
1499                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1500                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1501                       & CODING_FLAG_ISO_LATIN_EXTRA)
1502                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1503                   mask &= newmask;
1504                   mask_found |= newmask;
1505                 }
1506               else
1507                 return 0;
1508             }
1509           else
1510             {
1511               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1512                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1513               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1514               /* Check the length of succeeding codes of the range
1515                  0xA0..0FF.  If the byte length is odd, we exclude
1516                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1517                  when we are not single shifting.  */
1518               if (!single_shifting
1519                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1520                 {
1521                   int i = 1;
1522
1523                   c = -1;
1524                   while (src < src_end)
1525                     {
1526                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1527                       if (c < 0xA0)
1528                         break;
1529                       i++;
1530                     }
1531
1532                   if (i & 1 && src < src_end)
1533                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1534                   else
1535                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1536                   if (c >= 0)
1537                     /* This means that we have read one extra byte.  */
1538                     goto retry;
1539                 }
1540             }
1541           break;
1542         }
1543     }
1544  label_end_of_loop:
1545   return (mask & mask_found);
1546 }
1547
1548 /* Return the character code for position codes C1 and C2 in CHARSET.
1549    When the variable `translation_table' visible at the expansion
1550    site is non-nil, the code is produced by translate_char with that
1551    table; otherwise it is produced directly by MAKE_CHAR.  */
1552
1553 #define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
1554   (!NILP (translation_table)                                    \
1555    ? translate_char (translation_table, -1, charset, c1, c2)    \
1556    : MAKE_CHAR (charset, c1, c2))
1557
1558 /* Designate the charset selected by DIMENSION, CHARS and FINAL_CHAR to
1559    graphic register REG of CODING, or jump to label_invalid_code.  */
1560 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1561   do {                                                                     \
1562     int charset, c, designation_ok;                                        \
1563     if (final_char < '0' || final_char >= 128)                             \
1564       goto label_invalid_code;                                             \
1565     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1566                                  make_number (chars),                      \
1567                                  make_number (final_char));                \
1568     c = MAKE_CHAR (charset, 0, 0);                                         \
1569     designation_ok                                                         \
1570       = (charset >= 0                                                      \
1571          && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1572              || CODING_SAFE_CHAR_P (safe_chars, c)));                      \
1573     if (! designation_ok)                                                  \
1574       {                                                                    \
1575         /* Remember which register the rejected sequence targeted.  */     \
1576         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1577         goto label_invalid_code;                                           \
1578       }                                                                    \
1579     if (coding->spec.iso2022.last_invalid_designation_register == 0        \
1580         && reg == 0                                                        \
1581         && charset == CHARSET_ASCII)                                       \
1582       {                                                                    \
1583         /* We should insert this designation sequence as is so             \
1584            that it is surely written back to a file.  */                   \
1585         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1586         goto label_invalid_code;                                           \
1587       }                                                                    \
1588     coding->spec.iso2022.last_invalid_designation_register = -1;           \
1589     if ((coding->mode & CODING_MODE_DIRECTION)                             \
1590         && CHARSET_REVERSE_CHARSET (charset) >= 0)                         \
1591       charset = CHARSET_REVERSE_CHARSET (charset);                         \
1592     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
1593   } while (0)
1594
1595 /* Allocate one more composition_data block, record CHAR_OFFSET in it,
1596    chain it after CODING->cmp_data, and make it the current block.  */
1597
1598 void
1599 coding_allocate_composition_data (coding, char_offset)
1600      struct coding_system *coding;
1601      int char_offset;
1602 {
1603   struct composition_data *last = coding->cmp_data, *fresh;
1604
1605   fresh = (struct composition_data *) xmalloc (sizeof *fresh);
1606   fresh->next = NULL;
1607   fresh->prev = last;
1608   fresh->used = 0;
1609   fresh->char_offset = char_offset;
1610   if (last != NULL)
1611     last->next = fresh;
1612   coding->cmp_data_start = 0;
1613   coding->cmp_data = fresh;
1614 }
1615
1616 /* Handle a composition start sequence; C1 is the byte after ESC.
1617    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1618    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1619    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1620    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
1621    May jump to label_end_of_loop when coding->cmp_data lacks space.  */
1622
1623 #define DECODE_COMPOSITION_START(c1)                                       \
1624   do {                                                                     \
1625     if (coding->composing == COMPOSITION_DISABLED)                         \
1626       {                                                                    \
1627         *dst++ = ISO_CODE_ESC;                                             \
1628         *dst++ = c1 & 0x7f;                                                \
1629         coding->produced_char += 2;                                        \
1630       }                                                                    \
1631     else if (COMPOSING_P (coding))                                         \
1632       {                                                                    \
1633         /* Already inside a composition.  For the two ALTCHARS methods     \
1634            the codes after this escape sequence are the actual chars.  */  \
1635         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1636             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1637           {                                                                \
1638             coding->composing = COMPOSITION_RELATIVE;                      \
1639             coding->composition_rule_follows = 0;                          \
1640           }                                                                \
1641       }                                                                    \
1642     else                                                                   \
1643       {                                                                    \
1644         /* A composition surely starts here.  coding->cmp_data must have   \
1645            room for it.  Memory can't be allocated here (it may cause      \
1646            buffer/string relocation), so on shortage leave the loop and    \
1647            let the caller add a block and start the decoding again.  */    \
1648         if (!coding->cmp_data                                              \
1649             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1650                 >= COMPOSITION_DATA_SIZE))                                 \
1651           {                                                                \
1652             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1653             goto label_end_of_loop;                                        \
1654           }                                                                \
1655         if (c1 == '0')                                                     \
1656           coding->composing = COMPOSITION_RELATIVE;                        \
1657         else if (c1 == '2')                                                \
1658           coding->composing = COMPOSITION_WITH_RULE;                       \
1659         else if (c1 == '3')                                                \
1660           coding->composing = COMPOSITION_WITH_ALTCHARS;                   \
1661         else                                                               \
1662           coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;              \
1663         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1664                                       coding->composing);                  \
1665         coding->composition_rule_follows = 0;                              \
1666       }                                                                    \
1667   } while (0)
1668
1669 /* Handle composition end ESC 1; outside a composition, emit ESC C1 as is.  */
1670
1671 #define DECODE_COMPOSITION_END(c1)                                      \
1672   do {                                                                  \
1673     if (COMPOSING_P (coding))                                           \
1674       {                                                                 \
1675         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1676         coding->composing = COMPOSITION_NO;                             \
1677       }                                                                 \
1678     else                                                                \
1679       {                                                                 \
1680         *dst++ = ISO_CODE_ESC;                                          \
1681         *dst++ = c1;                                                    \
1682         coding->produced_char += 2;                                     \
1683       }                                                                 \
1684   } while (0)
1685
1686 /* Decode a composition rule from the byte C1 (and maybe one more byte
1687    from SRC) and store the encoded rule in coding->cmp_data.  C1 must
1688    be an lvalue: it is reduced by 32 in place.  The second byte, when
1689    needed, is read into the variable `c2' of the expansion scope.  A
1690    C1 of 125 or more (93 or more after the reduction) yields rule 0.  */
1691
1692 #define DECODE_COMPOSITION_RULE(c1)                                     \
1693   do {                                                                  \
1694     int rule = 0;                                                       \
1695     (c1) -= 32;                                                         \
1696     if (c1 < 81)                /* old format (before ver.21) */        \
1697       {                                                                 \
1698         int gref = (c1) / 9, nref = (c1) % 9;                           \
1699         rule = COMPOSITION_ENCODE_RULE ((gref == 4 ? 10 : gref),        \
1700                                         (nref == 4 ? 10 : nref));       \
1701       }                                                                 \
1702     else if (c1 < 93)           /* new format (after ver.21) */         \
1703       {                                                                 \
1704         ONE_MORE_BYTE (c2);                                             \
1705         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1706       }                                                                 \
1707     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1708     coding->composition_rule_follows = 0;                               \
1709   } while (0)
1710 /* Decode SRC_BYTES bytes of ISO2022 text at SOURCE into DESTINATION
1711    (DST_BYTES bytes long).  Designation, invocation and composition
1712    state, and the result/consumed/produced counts, are kept in CODING.
1713    See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1714 static void
1715 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1716      struct coding_system *coding;
1717      unsigned char *source, *destination;
1718      int src_bytes, dst_bytes;
1719 {
1720   unsigned char *src = source;
1721   unsigned char *src_end = source + src_bytes;
1722   unsigned char *dst = destination;
1723   unsigned char *dst_end = destination + dst_bytes;
1724   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1725   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1726   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1727   /* SRC_BASE remembers the start position in source in each loop.
1728      The loop will be exited when there's not enough source code
1729      (within macro ONE_MORE_BYTE), or when there's not enough
1730      destination area to produce a character (within macro
1731      EMIT_CHAR).  */
1732   unsigned char *src_base;
1733   int c, charset;
1734   Lisp_Object translation_table;
1735   Lisp_Object safe_chars;
1736
1737   safe_chars = coding_safe_chars (coding);
1738
1739   if (NILP (Venable_character_translation))
1740     translation_table = Qnil;
1741   else
1742     {
1743       translation_table = coding->translation_table_for_decode;
1744       if (NILP (translation_table))
1745         translation_table = Vstandard_translation_table_for_decode;
1746     }
1747
1748   coding->result = CODING_FINISH_NORMAL;
1749
1750   while (1)
1751     {
1752       int c1, c2 = 0;   /* C2 stays 0 unless a 2nd position code is read.  */
1753
1754       src_base = src;
1755       ONE_MORE_BYTE (c1);
1756
1757       /* We produce no character or one character.  */
1758       switch (iso_code_class [c1])
1759         {
1760         case ISO_0x20_or_0x7F:
1761           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1762             {
1763               DECODE_COMPOSITION_RULE (c1);
1764               continue;
1765             }
1766           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1767             {
1768               /* This is SPACE or DEL.  */
1769               charset = CHARSET_ASCII;
1770               break;
1771             }
1772           /* This is a graphic character, we fall down ...  */
1773
1774         case ISO_graphic_plane_0:
1775           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1776             {
1777               DECODE_COMPOSITION_RULE (c1);
1778               continue;
1779             }
1780           charset = charset0;
1781           break;
1782
1783         case ISO_0xA0_or_0xFF:
1784           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1785               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1786             goto label_invalid_code;
1787           /* This is a graphic character, we fall down ... */
1788
1789         case ISO_graphic_plane_1:
1790           if (charset1 < 0)
1791             goto label_invalid_code;
1792           charset = charset1;
1793           break;
1794
1795         case ISO_control_0:
1796           if (COMPOSING_P (coding))
1797             DECODE_COMPOSITION_END ('1');
1798
1799           /* All ISO2022 control characters in this class have the
1800              same representation in Emacs internal format.  */
1801           if (c1 == '\n'
1802               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1803               && (coding->eol_type == CODING_EOL_CR
1804                   || coding->eol_type == CODING_EOL_CRLF))
1805             {
1806               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1807               goto label_end_of_loop;
1808             }
1809           charset = CHARSET_ASCII;
1810           break;
1811
1812         case ISO_control_1:
1813           if (COMPOSING_P (coding))
1814             DECODE_COMPOSITION_END ('1');
1815           goto label_invalid_code;
1816
1817         case ISO_carriage_return:
1818           if (COMPOSING_P (coding))
1819             DECODE_COMPOSITION_END ('1');
1820
1821           if (coding->eol_type == CODING_EOL_CR)
1822             c1 = '\n';
1823           else if (coding->eol_type == CODING_EOL_CRLF)
1824             {
1825               ONE_MORE_BYTE (c1);
1826               if (c1 != ISO_CODE_LF)
1827                 {
1828                   src--;        /* Not CR LF: give the byte back.  */
1829                   c1 = '\r';
1830                 }
1831             }
1832           charset = CHARSET_ASCII;
1833           break;
1834
1835         case ISO_shift_out:
1836           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1837               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1838             goto label_invalid_code;
1839           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1840           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1841           continue;
1842
1843         case ISO_shift_in:
1844           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1845             goto label_invalid_code;
1846           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1847           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1848           continue;
1849
1850         case ISO_single_shift_2_7:
1851         case ISO_single_shift_2:
1852           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1853             goto label_invalid_code;
1854           /* SS2 is handled as an escape sequence of ESC 'N' */
1855           c1 = 'N';
1856           goto label_escape_sequence;
1857
1858         case ISO_single_shift_3:
1859           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1860             goto label_invalid_code;
1861           /* SS3 is handled as an escape sequence of ESC 'O' */
1862           c1 = 'O';
1863           goto label_escape_sequence;
1864
1865         case ISO_control_sequence_introducer:
1866           /* CSI is handled as an escape sequence of ESC '[' ...  */
1867           c1 = '[';
1868           goto label_escape_sequence;
1869
1870         case ISO_escape:
1871           ONE_MORE_BYTE (c1);
1872         label_escape_sequence:
1873           /* Escape sequences handled by Emacs are invocation,
1874              designation, direction specification, and character
1875              composition specification.  */
1876           switch (c1)
1877             {
1878             case '&':           /* revision of following character set */
1879               ONE_MORE_BYTE (c1);
1880               if (!(c1 >= '@' && c1 <= '~'))
1881                 goto label_invalid_code;
1882               ONE_MORE_BYTE (c1);
1883               if (c1 != ISO_CODE_ESC)
1884                 goto label_invalid_code;
1885               ONE_MORE_BYTE (c1);
1886               goto label_escape_sequence;
1887
1888             case '$':           /* designation of 2-byte character set */
1889               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1890                 goto label_invalid_code;
1891               ONE_MORE_BYTE (c1);
1892               if (c1 >= '@' && c1 <= 'B')
1893                 {       /* designation of JISX0208.1978, GB2312.1980,
1894                            or JISX0208.1980 */
1895                   DECODE_DESIGNATION (0, 2, 94, c1);
1896                 }
1897               else if (c1 >= 0x28 && c1 <= 0x2B)
1898                 {       /* designation of DIMENSION2_CHARS94 character set */
1899                   ONE_MORE_BYTE (c2);
1900                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1901                 }
1902               else if (c1 >= 0x2C && c1 <= 0x2F)
1903                 {       /* designation of DIMENSION2_CHARS96 character set */
1904                   ONE_MORE_BYTE (c2);
1905                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1906                 }
1907               else
1908                 goto label_invalid_code;
1909               /* We must update these variables now.  */
1910               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1911               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1912               continue;
1913
1914             case 'n':           /* invocation of locking-shift-2 */
1915               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1916                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1917                 goto label_invalid_code;
1918               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1919               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1920               continue;
1921
1922             case 'o':           /* invocation of locking-shift-3 */
1923               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1924                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1925                 goto label_invalid_code;
1926               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1927               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1928               continue;
1929
1930             case 'N':           /* invocation of single-shift-2 */
1931               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1932                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1933                 goto label_invalid_code;
1934               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1935               ONE_MORE_BYTE (c1);
1936               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1937                 goto label_invalid_code;
1938               break;
1939
1940             case 'O':           /* invocation of single-shift-3 */
1941               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1942                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1943                 goto label_invalid_code;
1944               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1945               ONE_MORE_BYTE (c1);
1946               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1947                 goto label_invalid_code;
1948               break;
1949
1950             case '0': case '2': case '3': case '4': /* start composition */
1951               DECODE_COMPOSITION_START (c1);
1952               continue;
1953
1954             case '1':           /* end composition */
1955               DECODE_COMPOSITION_END (c1);
1956               continue;
1957
1958             case '[':           /* specification of direction */
1959               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1960                 goto label_invalid_code;
1961               /* For the moment, nested direction is not supported.
1962                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1963                  left-to-right, and nonzero means right-to-left.  */
1964               ONE_MORE_BYTE (c1);
1965               switch (c1)
1966                 {
1967                 case ']':       /* end of the current direction */
1968                   coding->mode &= ~CODING_MODE_DIRECTION;
1969                   break;        /* `CSI ]' is complete; expect no 2nd `]'.  */
1970                 case '0':       /* end of the current direction */
1971                 case '1':       /* start of left-to-right direction */
1972                   ONE_MORE_BYTE (c1);
1973                   if (c1 == ']')
1974                     coding->mode &= ~CODING_MODE_DIRECTION;
1975                   else
1976                     goto label_invalid_code;
1977                   break;
1978
1979                 case '2':       /* start of right-to-left direction */
1980                   ONE_MORE_BYTE (c1);
1981                   if (c1 == ']')
1982                     coding->mode |= CODING_MODE_DIRECTION;
1983                   else
1984                     goto label_invalid_code;
1985                   break;
1986
1987                 default:
1988                   goto label_invalid_code;
1989                 }
1990               continue;
1991
1992             default:
1993               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1994                 goto label_invalid_code;
1995               if (c1 >= 0x28 && c1 <= 0x2B)
1996                 {       /* designation of DIMENSION1_CHARS94 character set */
1997                   ONE_MORE_BYTE (c2);
1998                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1999                 }
2000               else if (c1 >= 0x2C && c1 <= 0x2F)
2001                 {       /* designation of DIMENSION1_CHARS96 character set */
2002                   ONE_MORE_BYTE (c2);
2003                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2004                 }
2005               else
2006                 goto label_invalid_code;
2007               /* We must update these variables now.  */
2008               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2009               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2010               continue;
2011             }
2012         }
2013
2014       /* Now we know CHARSET and 1st position code C1 of a character.
2015          Produce a multibyte sequence for that character while getting
2016          2nd position code C2 if necessary.  */
2017       if (CHARSET_DIMENSION (charset) == 2)
2018         {
2019           ONE_MORE_BYTE (c2);
2020           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2021             /* C2 is not in a valid range.  */
2022             goto label_invalid_code;
2023         }
2024       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2025       EMIT_CHAR (c);
2026       continue;
2027
2028     label_invalid_code:
2029       coding->errors++;
2030       if (COMPOSING_P (coding))
2031         DECODE_COMPOSITION_END ('1');
2032       src = src_base;   /* Emit the first byte as is; resume after it.  */
2033       c = *src++;
2034       EMIT_CHAR (c);
2035     }
2036
2037  label_end_of_loop:
2038   coding->consumed = coding->consumed_char = src_base - source;
2039   coding->produced = dst - destination;
2040   return;
2041 }
2042
2043
2044 /* ISO2022 encoding stuff.  */
2045
2046 /*
2047    Saying just "ISO2022" is not enough for encoding; we have to
2048    specify more details.  In Emacs, each ISO2022 coding system
2049    variant has the following specifications:
2050         1. Initial designation to G0 through G3.
2051         2. Allows short-form designation?
2052         3. ASCII should be designated to G0 before control characters?
2053         4. ASCII should be designated to G0 at end of line?
2054         5. 7-bit environment or 8-bit environment?
2055         6. Use locking-shift?
2056         7. Use Single-shift?
2057    And the following two are only for Japanese:
2058         8. Use ASCII in place of JISX0201-1976-Roman?
2059         9. Use JISX0208-1983 in place of JISX0208-1978?
2060    These specifications are encoded in `coding->flags' as flag bits
2061    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2062    details.
2063 */
2064
2065 /* Produce codes (escape sequence) for designating CHARSET to graphic
2066    register REG at DST, and increment DST.  The short form is produced
2067    only if CODING has CODING_FLAG_ISO_SHORT_FORM, REG is 0, and CHARSET
2068    is a 94-chars set of dimension other than 1 with final '@'..'B'.  */
2069
2070 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2071   do {                                                                  \
2072     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2073     char *intermediate_char_94 = "()*+";    /* indexed by REG */        \
2074     char *intermediate_char_96 = ",-./";    /* indexed by REG */        \
2075     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2076                                                                         \
2077     if (revision < 255)  /* emit ESC & <'@' + revision> first */        \
2078       {                                                                 \
2079         *dst++ = ISO_CODE_ESC;                                          \
2080         *dst++ = '&';                                                   \
2081         *dst++ = '@' + revision;                                        \
2082       }                                                                 \
2083     *dst++ = ISO_CODE_ESC;  /* the designation sequence itself */       \
2084     if (CHARSET_DIMENSION (charset) == 1)                               \
2085       {                                                                 \
2086         if (CHARSET_CHARS (charset) == 94)                              \
2087           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2088         else                                                            \
2089           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2090       }                                                                 \
2091     else                                                                \
2092       {                                                                 \
2093         *dst++ = '$';  /* CHARSET's dimension is not 1 */               \
2094         if (CHARSET_CHARS (charset) == 94)                              \
2095           {  /* the short form omits the intermediate char */           \
2096             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
2097                 || reg != 0                                             \
2098                 || final_char < '@' || final_char > 'B')                \
2099               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2100           }                                                             \
2101         else                                                            \
2102           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2103       }                                                                 \
2104     *dst++ = final_char;                                                \
2105     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
2106   } while (0)
2107
2108 /* The following two macros produce codes for ISO2022 single-shift-2
2109    and single-shift-3 (ESC N / ESC O if CODING has the 7-bit flag, else
2110    SS2 / SS3), and set the single-shifting flag of CODING.  */
2111
2112 #define ENCODE_SINGLE_SHIFT_2                           \
2113   do {                                                  \
2114     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2115       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
2116     else                                                \
2117       *dst++ = ISO_CODE_SS2;                            \
2118     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2119   } while (0)
2120
2121 #define ENCODE_SINGLE_SHIFT_3                           \
2122   do {                                                  \
2123     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2124       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
2125     else                                                \
2126       *dst++ = ISO_CODE_SS3;                            \
2127     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2128   } while (0)
2129
2130 /* The following four macros produce codes for ISO2022 locking-shift
2131    functions (SI, SO, ESC n, and ESC o), and record in CODING that
2132    graphic register 0, 1, 2, or 3 respectively is invoked to plane 0.  */
2133
2134 #define ENCODE_SHIFT_IN                         \
2135   do {                                          \
2136     *dst++ = ISO_CODE_SI;                       \
2137     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2138   } while (0)
2139
2140 #define ENCODE_SHIFT_OUT                        \
2141   do {                                          \
2142     *dst++ = ISO_CODE_SO;                       \
2143     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2144   } while (0)
2145
2146 #define ENCODE_LOCKING_SHIFT_2                  \
2147   do {                                          \
2148     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
2149     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
2150   } while (0)
2151
2152 #define ENCODE_LOCKING_SHIFT_3                  \
2153   do {                                          \
2154     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
2155     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
2156   } while (0)
2157
2158 /* Produce codes for a DIMENSION1 character whose character set is
2159    CHARSET and whose position-code is C1.  Designation and invocation
2160    sequences are also produced in advance if necessary.  This macro
2161    refers to the variables `coding' and `dst' of the expansion site.  */
2162 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2163   do {  /* left by `break' once the code is produced */                 \
2164     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2165       {  /* a single-shift code was just produced */                    \
2166         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2167           *dst++ = c1 & 0x7F;                                           \
2168         else                                                            \
2169           *dst++ = c1 | 0x80;                                           \
2170         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2171         break;                                                          \
2172       }                                                                 \
2173     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2174       {  /* CHARSET is invoked to graphic plane 0 */                    \
2175         *dst++ = c1 & 0x7F;                                             \
2176         break;                                                          \
2177       }                                                                 \
2178     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2179       {  /* CHARSET is invoked to graphic plane 1 */                    \
2180         *dst++ = c1 | 0x80;                                             \
2181         break;                                                          \
2182       }                                                                 \
2183     else                                                                \
2184       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2185          must invoke it, or, at first, designate it to some graphic     \
2186          register.  Then repeat the loop to actually produce the        \
2187          character.  */                                                 \
2188       dst = encode_invocation_designation (charset, coding, dst);       \
2189   } while (1)
2190
2191 /* Produce codes for a DIMENSION2 character whose character set is
2192    CHARSET and whose position-codes are C1 and C2.  Designation and
2193    invocation codes are also produced in advance if necessary.  This
2194    macro refers to the variables `coding' and `dst' where expanded.  */
2195 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2196   do {  /* left by `break' once the codes are produced */               \
2197     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2198       {  /* a single-shift code was just produced */                    \
2199         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2200           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
2201         else                                                            \
2202           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
2203         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2204         break;                                                          \
2205       }                                                                 \
2206     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2207       {  /* CHARSET is invoked to graphic plane 0 */                    \
2208         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
2209         break;                                                          \
2210       }                                                                 \
2211     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2212       {  /* CHARSET is invoked to graphic plane 1 */                    \
2213         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
2214         break;                                                          \
2215       }                                                                 \
2216     else                                                                \
2217       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2218          must invoke it, or, at first, designate it to some graphic     \
2219          register.  Then repeat the loop to actually produce the        \
2220          character.  */                                                 \
2221       dst = encode_invocation_designation (charset, coding, dst);       \
2222   } while (1)
2223
2224 #define ENCODE_ISO_CHARACTER(c)  /* encode C at DST */          \
2225   do {                                                          \
2226     int charset, c1, c2;                                        \
2227                                                                 \
2228     SPLIT_CHAR (c, charset, c1, c2);                            \
2229     if (CHARSET_DEFINED_P (charset))                            \
2230       {                                                         \
2231         if (CHARSET_DIMENSION (charset) == 1)                   \
2232           {  /* ASCII may be replaced by latin-jisx0201 */      \
2233             if (charset == CHARSET_ASCII                        \
2234                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
2235               charset = charset_latin_jisx0201;                 \
2236             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
2237           }                                                     \
2238         else                                                    \
2239           {  /* jisx0208 may be replaced by 1978 version */     \
2240             if (charset == charset_jisx0208                     \
2241                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
2242               charset = charset_jisx0208_1978;                  \
2243             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
2244           }                                                     \
2245       }                                                         \
2246     else                                                        \
2247       {  /* undefined charset: emit codes as is */              \
2248         *dst++ = c1;                                            \
2249         if (c2 >= 0)                                            \
2250           *dst++ = c2;                                          \
2251       }                                                         \
2252   } while (0)
2253
2254 /* Instead of encoding character C, produce one `?' (or two if the
2255    width of C's charset is more than 1) as its substitution.  */
2256
2257 #define ENCODE_UNSAFE_CHARACTER(c)                                      \
2258   do {                                                                  \
2259     ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);       \
2260     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                           \
2261       ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
2262   } while (0)
2263
2264
2265 /* Produce designation and invocation codes at a place pointed by DST
2266    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2267    Return new DST.  The caller must assure enough room at DST.  */
2268
2269 unsigned char *
2270 encode_invocation_designation (charset, coding, dst)
2271      int charset;
2272      struct coding_system *coding;
2273      unsigned char *dst;
2274 {
2275   int reg;                      /* graphic register number */
2276
2277   /* At first, check designations.  */
2278   for (reg = 0; reg < 4; reg++)
2279     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2280       break;
2281
2282   if (reg >= 4)
2283     {
2284       /* CHARSET is not yet designated to any graphic registers.  */
2285       /* At first check the requested designation.  */
2286       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2287       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2288         /* Since CHARSET requests no special designation, designate it
2289            to graphic register 0.  */
2290         reg = 0;
2291
2292       ENCODE_DESIGNATION (charset, reg, coding);
2293     }
2294   /* Now CHARSET is designated to REG; invoke REG if necessary.  */
2295   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2296       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2297     {
2298       /* Since the graphic register REG is not invoked to any graphic
2299          planes, invoke it to graphic plane 0.  */
2300       switch (reg)
2301         {
2302         case 0:                 /* graphic register 0 */
2303           ENCODE_SHIFT_IN;
2304           break;
2305
2306         case 1:                 /* graphic register 1 */
2307           ENCODE_SHIFT_OUT;
2308           break;
2309
2310         case 2:                 /* graphic register 2 */
2311           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2312             ENCODE_SINGLE_SHIFT_2;
2313           else
2314             ENCODE_LOCKING_SHIFT_2;
2315           break;
2316
2317         case 3:                 /* graphic register 3 */
2318           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2319             ENCODE_SINGLE_SHIFT_3;
2320           else
2321             ENCODE_LOCKING_SHIFT_3;
2322           break;
2323         }
2324     }
2325
2326   return dst;
2327 }
2328
2329 /* Produce 2-byte codes for encoded composition rule RULE: GREF and
2330    NREF decoded from RULE, offset by 32 + 81 and by 32 respectively.  */
2331 #define ENCODE_COMPOSITION_RULE(rule)           \
2332   do {                                          \
2333     int gref, nref;                             \
2334     COMPOSITION_DECODE_RULE (rule, gref, nref); \
2335     *dst++ = 32 + 81 + gref;                    \
2336     *dst++ = 32 + nref;                         \
2337   } while (0)
2338
2339 /* Produce codes for indicating the start of a composition sequence
2340    (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
2341    which specify information about the composition.  See the comment
2342    in coding.h for the format of DATA.  */
2343
2344 #define ENCODE_COMPOSITION_START(coding, data)                          \
2345   do {                                                                  \
2346     coding->composing = data[3];  /* the composition method */          \
2347     *dst++ = ISO_CODE_ESC;                                              \
2348     if (coding->composing == COMPOSITION_RELATIVE)                      \
2349       *dst++ = '0';                                                     \
2350     else                                                                \
2351       {  /* components will be produced first */                        \
2352         *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
2353                   ? '3' : '4');                                         \
2354         coding->cmp_data_index = coding->cmp_data_start + 4;            \
2355         coding->composition_rule_follows = 0;                           \
2356       }                                                                 \
2357   } while (0)
2358
2359 /* Produce codes for indicating the end of the current composition
2360    (ESC 1), and advance CODING past the composition data DATA.  */
2361 #define ENCODE_COMPOSITION_END(coding, data)                    \
2362   do {                                                          \
2363     *dst++ = ISO_CODE_ESC;                                      \
2364     *dst++ = '1';                                               \
2365     coding->cmp_data_start += data[0];                          \
2366     coding->composing = COMPOSITION_NO;                         \
2367     if (coding->cmp_data_start == coding->cmp_data->used        \
2368         && coding->cmp_data->next)                              \
2369       {  /* this block is used up: go to the next one */        \
2370         coding->cmp_data = coding->cmp_data->next;              \
2371         coding->cmp_data_start = 0;                             \
2372       }                                                         \
2373   } while (0)
2374
2375 /* Produce composition start sequence ESC 0.  Here, this sequence
2376    doesn't mean the start of a new composition but means that we have
2377    just produced components (alternate chars and composition rules) of
2378    the composition and the actual text follows in SRC.  From now on
2379    CODING is treated as composing by COMPOSITION_RELATIVE.  */
2380 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2381   do {                                          \
2382     *dst++ = ISO_CODE_ESC;                      \
2383     *dst++ = '0';                               \
2384     coding->composing = COMPOSITION_RELATIVE;   \
2385   } while (0)
2386
2387 /* The following three macros produce codes for indicating direction
2388    of text.  Each is a statement; `coding' and `dst' must be in scope.  */
2389 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2390   do {                                                  \
2391     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2392       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
2393     else                                                \
2394       *dst++ = ISO_CODE_CSI;                            \
2395   } while (0)
2396
2397 #define ENCODE_DIRECTION_R2L    /* CSI 2 ] */ \
2398   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2'; *dst++ = ']'; } while (0)
2399
2400 #define ENCODE_DIRECTION_L2R    /* CSI 0 ] */ \
2401   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0'; *dst++ = ']'; } while (0)
2402
2403 /* Produce codes for designation and invocation to reset the graphic
2404    planes and registers to the initial state of CODING.  */
2405 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2406   do {                                                                      \
2407     int reg;                                                                \
2408     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2409       ENCODE_SHIFT_IN;  /* register 0 back to plane 0 */                    \
2410     for (reg = 0; reg < 4; reg++)  /* restore changed designations */       \
2411       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
2412           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
2413               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
2414         ENCODE_DESIGNATION                                                  \
2415           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2416   } while (0)
2417
2418 /* Produce designation sequences of charsets in the line started from
2419    SRC to a place pointed by DST, and return updated DST.  Only the
2420    first charset requesting each graphic register is designated.
2421    If the current block ends before any end-of-line, we may fail to
2422    find all the necessary designations.  */
2423
2424 static unsigned char *
2425 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2426      struct coding_system *coding;
2427      Lisp_Object translation_table;
2428      unsigned char *src, *src_end, *dst;
2429 {
2430   int charset, c, found = 0, reg;
2431   /* Table of charsets to be designated to each graphic register.  */
2432   int r[4];
2433
2434   for (reg = 0; reg < 4; reg++)
2435     r[reg] = -1;
2436
2437   while (found < 4)             /* until all registers are decided */
2438     {
2439       ONE_MORE_CHAR (c);        /* presumably exits via label_end_of_loop */
2440       if (c == '\n')
2441         break;
2442
2443       charset = CHAR_CHARSET (c);
2444       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2445       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2446         {
2447           found++;
2448           r[reg] = charset;
2449         }
2450     }
2451
2452  label_end_of_loop:
2453   if (found)
2454     {
2455       for (reg = 0; reg < 4; reg++)     /* skip unchanged designations */
2456         if (r[reg] >= 0
2457             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2458           ENCODE_DESIGNATION (r[reg], reg, coding);
2459     }
2460
2461   return dst;
2462 }
2463
2464 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2465    On return, CODING holds the numbers of consumed and produced bytes.  */
2466 static void
2467 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2468      struct coding_system *coding;
2469      unsigned char *source, *destination;
2470      int src_bytes, dst_bytes;
2471 {
2472   unsigned char *src = source;
2473   unsigned char *src_end = source + src_bytes;
2474   unsigned char *dst = destination;
2475   unsigned char *dst_end = destination + dst_bytes;
2476   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2477      from DST_END to assure overflow checking is necessary only at the
2478      head of loop.  */
2479   unsigned char *adjusted_dst_end = dst_end - 19;
2480   /* SRC_BASE remembers the start position in source in each loop.
2481      The loop will be exited when there's not enough source text to
2482      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2483      there's not enough destination area to produce encoded codes
2484      (checked against ADJUSTED_DST_END at the head of each loop).  */
2485   unsigned char *src_base;
2486   int c;
2487   Lisp_Object translation_table;
2488   Lisp_Object safe_chars;
2489
2490   safe_chars = coding_safe_chars (coding);
2491
2492   if (NILP (Venable_character_translation))
2493     translation_table = Qnil;
2494   else
2495     {
2496       translation_table = coding->translation_table_for_encode;
2497       if (NILP (translation_table))
2498         translation_table = Vstandard_translation_table_for_encode;
2499     }
2500
2501   coding->consumed_char = 0;
2502   coding->errors = 0;
2503   while (1)
2504     {
2505       src_base = src;
2506       /* If DST_BYTES is zero, the limit is 19 bytes before SRC instead.  */
2507       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2508         {
2509           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2510           break;
2511         }
2512
2513       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2514           && CODING_SPEC_ISO_BOL (coding))
2515         {
2516           /* We have to produce designation sequences if any now.  */
2517           dst = encode_designation_at_bol (coding, translation_table,
2518                                            src, src_end, dst);
2519           CODING_SPEC_ISO_BOL (coding) = 0;
2520         }
2521
2522       /* Check composition start and end.  */
2523       if (coding->composing != COMPOSITION_DISABLED
2524           && coding->cmp_data_start < coding->cmp_data->used)
2525         {
2526           struct composition_data *cmp_data = coding->cmp_data;
2527           int *data = cmp_data->data + coding->cmp_data_start;
2528           int this_pos = cmp_data->char_offset + coding->consumed_char;
2529
2530           if (coding->composing == COMPOSITION_RELATIVE)
2531             {
2532               if (this_pos == data[2])
2533                 {
2534                   ENCODE_COMPOSITION_END (coding, data);
2535                   cmp_data = coding->cmp_data;
2536                   data = cmp_data->data + coding->cmp_data_start;
2537                 }
2538             }
2539           else if (COMPOSING_P (coding))
2540             {
2541               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS  */
2542               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2543                 /* We have consumed components of the composition.
2544                    What follows in SRC is the composition's base
2545                    text.  */
2546                 ENCODE_COMPOSITION_FAKE_START (coding);
2547               else
2548                 {
2549                   int c = cmp_data->data[coding->cmp_data_index++]; /* shadows outer C */
2550                   if (coding->composition_rule_follows)
2551                     {
2552                       ENCODE_COMPOSITION_RULE (c);
2553                       coding->composition_rule_follows = 0;
2554                     }
2555                   else
2556                     {
2557                       if (coding->flags & CODING_FLAG_ISO_SAFE
2558                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2559                         ENCODE_UNSAFE_CHARACTER (c);
2560                       else
2561                         ENCODE_ISO_CHARACTER (c);
2562                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2563                         coding->composition_rule_follows = 1;
2564                     }
2565                   continue;
2566                 }
2567             }
2568           if (!COMPOSING_P (coding))
2569             {
2570               if (this_pos == data[1])
2571                 {
2572                   ENCODE_COMPOSITION_START (coding, data);
2573                   continue;
2574                 }
2575             }
2576         }
2577
2578       ONE_MORE_CHAR (c);
2579
2580       /* Now encode the character C.  */
2581       if (c < 0x20 || c == 0x7F)
2582         {
2583           if (c == '\r')
2584             {
2585               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2586                 {
2587                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2588                     ENCODE_RESET_PLANE_AND_REGISTER;
2589                   *dst++ = c;
2590                   continue;
2591                 }
2592               /* fall down to treat '\r' as '\n' ...  */
2593               c = '\n';
2594             }
2595           if (c == '\n')
2596             {
2597               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2598                 ENCODE_RESET_PLANE_AND_REGISTER;
2599               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2600                 bcopy (coding->spec.iso2022.initial_designation,
2601                        coding->spec.iso2022.current_designation,
2602                        sizeof coding->spec.iso2022.initial_designation);
2603               if (coding->eol_type == CODING_EOL_LF
2604                   || coding->eol_type == CODING_EOL_UNDECIDED)
2605                 *dst++ = ISO_CODE_LF;
2606               else if (coding->eol_type == CODING_EOL_CRLF)
2607                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2608               else
2609                 *dst++ = ISO_CODE_CR;
2610               CODING_SPEC_ISO_BOL (coding) = 1;
2611             }
2612           else
2613             {
2614               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2615                 ENCODE_RESET_PLANE_AND_REGISTER;
2616               *dst++ = c;
2617             }
2618         }
2619       else if (ASCII_BYTE_P (c))
2620         ENCODE_ISO_CHARACTER (c);
2621       else if (SINGLE_BYTE_CHAR_P (c))
2622         {
2623           *dst++ = c;           /* produced as is, but counted as an error */
2624           coding->errors++;
2625         }
2626       else if (coding->flags & CODING_FLAG_ISO_SAFE
2627                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2628         ENCODE_UNSAFE_CHARACTER (c);
2629       else
2630         ENCODE_ISO_CHARACTER (c);
2631
2632       coding->consumed_char++;
2633     }
2634
2635  label_end_of_loop:
2636   coding->consumed = src_base - source;
2637   coding->produced = coding->produced_char = dst - destination;
2638 }
2639
2640 \f
2641 /*** 4. SJIS and BIG5 handlers ***/
2642
2643 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2644    quite widely.  So, for the moment, Emacs supports them in the bare
2645    C code.  But, in the future, they may be supported only by CCL.  */
2646
2647 /* SJIS is a coding system encoding three character sets: ASCII, right
2648    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2649    as is.  A character of charset katakana-jisx0201 is encoded by
2650    "position-code + 0x80".  A character of charset japanese-jisx0208
2651    is encoded in 2-byte but two position-codes are divided and shifted
2652    so that it fits in the range below.
2653
2654    --- CODE RANGE of SJIS ---
2655    (character set)      (range)
2656    ASCII                0x00 .. 0x7F
2657    KATAKANA-JISX0201    0xA1 .. 0xDF
2658    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2659             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2660    -------------------------------
2661
2662 */
2663
2664 /* BIG5 is a coding system encoding two character sets: ASCII and
2665    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2666    character set and is encoded in two bytes.
2667
2668    --- CODE RANGE of BIG5 ---
2669    (character set)      (range)
2670    ASCII                0x00 .. 0x7F
2671    Big5 (1st byte)      0xA1 .. 0xFE
2672         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2673    --------------------------
2674
2675    Since the number of characters in Big5 is larger than the maximum
2676    size of an Emacs charset (96x96), it can't be handled as one
2677    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2678    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2679    contains frequently used characters and the latter contains less
2680    frequently used characters.  */
2681
2682 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2683    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2684    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2685    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  Pass
2686    plain variables: arguments are expanded unparenthesized.  */
2687 /* Number of Big5 characters which have the same 1st byte (157).  */
2688 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2689
2690 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2691   do {                                                                  \
2692     unsigned int temp  /* linear index of (B1, B2) */                   \
2693       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
2694     if (b1 < 0xC9)                                                      \
2695       charset = charset_big5_1;                                         \
2696     else                                                                \
2697       {  /* index relative to the first row of big5-2 */                \
2698         charset = charset_big5_2;                                       \
2699         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
2700       }                                                                 \
2701     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
2702     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
2703   } while (0)
2704
2705 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2706   do {                                                                  \
2707     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
2708     if (charset == charset_big5_2)                                      \
2709       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2710     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
2711     b2 = temp % BIG5_SAME_ROW;                                          \
2712     b2 += b2 < 0x3F ? 0x40 : 0x62;  /* 0x40..0x7E or 0xA1..0xFE */      \
2713   } while (0)
2714
2715 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2716    Check if a text is encoded in SJIS.  If it is, return
2717    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2718
2719 static int
2720 detect_coding_sjis (src, src_end, multibytep)
2721      unsigned char *src, *src_end;
2722      int multibytep;
2723 {
2724   int c;
2725   /* Dummy for ONE_MORE_BYTE.  */
2726   struct coding_system dummy_coding;
2727   struct coding_system *coding = &dummy_coding;
2728
2729   while (1)
2730     {
2731       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2732       if (c < 0x80)                             /* ASCII */
2733         continue;
2734       if (c == 0x80 || c == 0xA0 || c > 0xEF)   /* not used in SJIS */
2735         return 0;
2736       if (c <= 0x9F || c >= 0xE0)               /* 1st byte of JISX0208 */
2737         {
2738           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2739           if (c < 0x40 || c == 0x7F || c > 0xFC)        /* bad 2nd byte */
2740             return 0;
2741         }
2742     }
2743  label_end_of_loop:     /* presumably reached at the end of source */
2744   return CODING_CATEGORY_MASK_SJIS;
2745 }
2746
2747 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2748    Check if a text is encoded in BIG5.  Return 0 as soon as a byte
2749    rejected below is read, else return CODING_CATEGORY_MASK_BIG5.  */
2750
2751 static int
2752 detect_coding_big5 (src, src_end, multibytep)
2753      unsigned char *src, *src_end;
2754      int multibytep;
2755 {
2756   int c;
2757   /* Dummy for ONE_MORE_BYTE.  */
2758   struct coding_system dummy_coding;
2759   struct coding_system *coding = &dummy_coding;
2760   /* NOTE(review): the fetch macro presumably jumps to label_end_of_loop at end of input; confirm.  */
2761   while (1)
2762     {
2763       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2764       if (c < 0x80)                     /* 7-bit byte: nothing to check.  */
2765         continue;
2766       if (c < 0xA1 || c > 0xFE)         /* 1st byte must be 0xA1..0xFE.  */
2767         return 0;
2768       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2769       /* 2nd byte must be 0x40..0x7E or 0xA1..0xFE, as in decode_coding_sjis_big5.  */
2770       if (c < 0x40 || (c > 0x7E && c < 0xA1) || c > 0xFE)
2771         return 0;
2772     }
2773  label_end_of_loop:
2774   return CODING_CATEGORY_MASK_BIG5;
2775
2776 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2777    Check if a text is encoded in UTF-8.  If it is, return
2778    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2779 /* Predicates classifying an octet C by its high-order bits.  */
2780 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)             /* 0xxxxxxx */
2781 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)   /* 10xxxxxx */
2782 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)   /* 110xxxxx */
2783 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)   /* 1110xxxx */
2784 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)   /* 11110xxx */
2785 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)   /* 111110xx */
2786 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)   /* 1111110x */
2787
2788 static int
2789 detect_coding_utf_8 (src, src_end, multibytep)
2790      unsigned char *src, *src_end;
2791      int multibytep;
2792 {
2793   unsigned char c;
2794   int seq_maybe_bytes;          /* Extra octets still expected.  */
2795   /* Dummy for ONE_MORE_BYTE.  */
2796   struct coding_system dummy_coding;
2797   struct coding_system *coding = &dummy_coding;
2798   /* NOTE(review): the fetch macro presumably jumps to label_end_of_loop at end of input; confirm.  */
2799   while (1)
2800     {
2801       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2802       if (UTF_8_1_OCTET_P (c))
2803         continue;
2804       else if (UTF_8_2_OCTET_LEADING_P (c))
2805         seq_maybe_bytes = 1;
2806       else if (UTF_8_3_OCTET_LEADING_P (c))
2807         seq_maybe_bytes = 2;
2808       else if (UTF_8_4_OCTET_LEADING_P (c))
2809         seq_maybe_bytes = 3;
2810       else if (UTF_8_5_OCTET_LEADING_P (c))
2811         seq_maybe_bytes = 4;
2812       else if (UTF_8_6_OCTET_LEADING_P (c))
2813         seq_maybe_bytes = 5;
2814       else
2815         return 0;               /* Not acceptable as a first octet.  */
2816       /* Each extra octet must be 10xxxxxx; the decoded value is not checked.  */
2817       do
2818         {
2819           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2820           if (!UTF_8_EXTRA_OCTET_P (c))
2821             return 0;
2822           seq_maybe_bytes--;
2823         }
2824       while (seq_maybe_bytes > 0);
2825     }
2826
2827  label_end_of_loop:
2828   return CODING_CATEGORY_MASK_UTF_8;
2829 }
2830
2831 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2832    Check if a text starts with a UTF-16 byte order mark.  Return
2833    CODING_CATEGORY_MASK_UTF_16_LE if the first two bytes are FF FE,
2834    CODING_CATEGORY_MASK_UTF_16_BE if they are FE FF, else return 0.
2835    Nothing after the first two bytes is examined.  */
2836
2837 #define UTF_16_INVALID_P(val)   \
2838   (((val) == 0xFFFE)            \
2839    || ((val) == 0xFFFF))
2840 /* High surrogates are 0xD800..0xDBFF: compare the top six bits.  */
2841 #define UTF_16_HIGH_SURROGATE_P(val) \
2842   (((val) & 0xFC00) == 0xD800)
2843 /* Low surrogates are 0xDC00..0xDFFF: compare the top six bits.  */
2844 #define UTF_16_LOW_SURROGATE_P(val) \
2845   (((val) & 0xFC00) == 0xDC00)
2846
2847 static int
2848 detect_coding_utf_16 (src, src_end, multibytep)
2849      unsigned char *src, *src_end;
2850      int multibytep;
2851 {
2852   unsigned char c1, c2;
2853   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
2854   struct coding_system dummy_coding;
2855   struct coding_system *coding = &dummy_coding;
2856   /* NOTE(review): with fewer than two bytes the fetch macro presumably jumps to label_end_of_loop; confirm.  */
2857   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2858   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2859
2860   if ((c1 == 0xFF) && (c2 == 0xFE))
2861     return CODING_CATEGORY_MASK_UTF_16_LE;
2862   else if ((c1 == 0xFE) && (c2 == 0xFF))
2863     return CODING_CATEGORY_MASK_UTF_16_BE;
2864
2865  label_end_of_loop:
2866   return 0;
2867 }
2868
2869 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2870    If SJIS_P is nonzero, decode SJIS text, else decode BIG5 text.
2871    An invalid byte is counted in CODING->errors and emitted as is.  */
2872 static void
2873 decode_coding_sjis_big5 (coding, source, destination,
2874                          src_bytes, dst_bytes, sjis_p)
2875      struct coding_system *coding;
2876      unsigned char *source, *destination;
2877      int src_bytes, dst_bytes;
2878      int sjis_p;
2879 {
2880   unsigned char *src = source;
2881   unsigned char *src_end = source + src_bytes;
2882   unsigned char *dst = destination;
2883   unsigned char *dst_end = destination + dst_bytes;
2884   /* SRC_BASE remembers the start position in source in each loop.
2885      The loop will be exited when there's not enough source code
2886      (within macro ONE_MORE_BYTE), or when there's not enough
2887      destination area to produce a character (within macro
2888      EMIT_CHAR).  */
2889   unsigned char *src_base;
2890   Lisp_Object translation_table;
2891
2892   if (NILP (Venable_character_translation))
2893     translation_table = Qnil;
2894   else
2895     {
2896       translation_table = coding->translation_table_for_decode;
2897       if (NILP (translation_table))
2898         translation_table = Vstandard_translation_table_for_decode;
2899     }
2900
2901   coding->produced_char = 0;
2902   while (1)
2903     {
2904       int c, charset, c1, c2;
2905
2906       src_base = src;
2907       ONE_MORE_BYTE (c1);
2908
2909       if (c1 < 0x80)
2910         {
2911           charset = CHARSET_ASCII;
2912           if (c1 < 0x20)
2913             {
2914               if (c1 == '\r')
2915                 {
2916                   if (coding->eol_type == CODING_EOL_CRLF)
2917                     {
2918                       ONE_MORE_BYTE (c2);
2919                       if (c2 == '\n')
2920                         c1 = c2;
2921                       else
2922                         /* To process C2 again, SRC is subtracted by 1.  */
2923                         src--;
2924                     }
2925                   else if (coding->eol_type == CODING_EOL_CR)
2926                     c1 = '\n';
2927                 }
2928               else if (c1 == '\n'
2929                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2930                        && (coding->eol_type == CODING_EOL_CR
2931                            || coding->eol_type == CODING_EOL_CRLF))
2932                 {
2933                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2934                   goto label_end_of_loop;
2935                 }
2936             }
2937         }
2938       else
2939         {
2940           if (sjis_p)
2941             {
2942               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
2943                 goto label_invalid_code;
2944               if (c1 <= 0x9F || c1 >= 0xE0)
2945                 {
2946                   /* SJIS -> JISX0208 */
2947                   ONE_MORE_BYTE (c2);
2948                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2949                     goto label_invalid_code;
2950                   DECODE_SJIS (c1, c2, c1, c2);
2951                   charset = charset_jisx0208;
2952                 }
2953               else
2954                 /* SJIS -> JISX0201-Kana */
2955                 charset = charset_katakana_jisx0201;
2956             }
2957           else
2958             {
2959               /* BIG5 -> Big5.  0xA0 is rejected: DECODE_BIG5 computes C1 - 0xA1.  */
2960               if (c1 < 0xA1 || c1 > 0xFE)
2961                 goto label_invalid_code;
2962               ONE_MORE_BYTE (c2);
2963               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2964                 goto label_invalid_code;
2965               DECODE_BIG5 (c1, c2, charset, c1, c2);
2966             }
2967         }
2968
2969       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2970       EMIT_CHAR (c);
2971       continue;
2972       /* Emit the first byte of the rejected sequence and resume after it.  */
2973     label_invalid_code:
2974       coding->errors++;
2975       src = src_base;
2976       c = *src++;
2977       EMIT_CHAR (c);
2978     }
2979
2980  label_end_of_loop:
2981   coding->consumed = coding->consumed_char = src_base - source;
2982   coding->produced = dst - destination;
2983   return;
2984 }
2985
2986 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2987    If SJIS_P is nonzero, encode SJIS text (charset_jisx0208,
2988    charset_jisx0208_1978, charset_katakana_jisx0201 and
2989    charset_latin_jisx0201), else BIG5 text (charset_big5_1 and
2990    charset_big5_2).  We are sure that all these charsets are
2991    registered as official charset (i.e. do not have extended
2992    leading-codes).  Characters of other charsets are produced as is.  */
2993
2994 static void
2995 encode_coding_sjis_big5 (coding, source, destination,
2996                          src_bytes, dst_bytes, sjis_p)
2997      struct coding_system *coding;
2998      unsigned char *source, *destination;
2999      int src_bytes, dst_bytes;
3000      int sjis_p;
3001 {
3002   unsigned char *src = source;
3003   unsigned char *src_end = source + src_bytes;
3004   unsigned char *dst = destination;
3005   unsigned char *dst_end = destination + dst_bytes;
3006   /* SRC_BASE remembers the start position in source in each loop.
3007      The loop will be exited when there's not enough source text to
3008      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3009      there's not enough destination area to produce encoded codes
3010      (within macro EMIT_BYTES).  */
3011   unsigned char *src_base;
3012   Lisp_Object translation_table;
3013   /* NOTE(review): translation_table is not read below; presumably ONE_MORE_CHAR uses it -- confirm.  */
3014   if (NILP (Venable_character_translation))
3015     translation_table = Qnil;
3016   else
3017     {
3018       translation_table = coding->translation_table_for_encode;
3019       if (NILP (translation_table))
3020         translation_table = Vstandard_translation_table_for_encode;
3021     }
3022
3023   while (1)
3024     {
3025       int c, charset, c1, c2;
3026
3027       src_base = src;
3028       ONE_MORE_CHAR (c);
3029
3030       /* Now encode the character C.  */
3031       if (SINGLE_BYTE_CHAR_P (c))
3032         {
3033           switch (c)
3034             {
3035             case '\r':
3036               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3037                 {
3038                   EMIT_ONE_BYTE (c);
3039                   break;
3040                 }
3041               c = '\n';         /* Treat as end-of-line; fall through.  */
3042             case '\n':
3043               if (coding->eol_type == CODING_EOL_CRLF)
3044                 {
3045                   EMIT_TWO_BYTES ('\r', c);
3046                   break;
3047                 }
3048               else if (coding->eol_type == CODING_EOL_CR)
3049                 c = '\r';       /* Fall through to emit C.  */
3050             default:
3051               EMIT_ONE_BYTE (c);
3052             }
3053         }
3054       else
3055         {
3056           SPLIT_CHAR (c, charset, c1, c2);
3057           if (sjis_p)
3058             {
3059               if (charset == charset_jisx0208
3060                   || charset == charset_jisx0208_1978)
3061                 {
3062                   ENCODE_SJIS (c1, c2, c1, c2);
3063                   EMIT_TWO_BYTES (c1, c2);
3064                 }
3065               else if (charset == charset_katakana_jisx0201)
3066                 EMIT_ONE_BYTE (c1 | 0x80);
3067               else if (charset == charset_latin_jisx0201)
3068                 EMIT_ONE_BYTE (c1);
3069               else
3070                 /* There's no way other than producing the internal
3071                    codes as is.  */
3072                 EMIT_BYTES (src_base, src);
3073             }
3074           else
3075             {
3076               if (charset == charset_big5_1 || charset == charset_big5_2)
3077                 {
3078                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3079                   EMIT_TWO_BYTES (c1, c2);
3080                 }
3081               else
3082                 /* There's no way other than producing the internal
3083                    codes as is.  */
3084                 EMIT_BYTES (src_base, src);
3085             }
3086         }
3087       coding->consumed_char++;
3088     }
3089
3090  label_end_of_loop:
3091   coding->consumed = src_base - source;  /* Up to the iteration that was left.  */
3092   coding->produced = coding->produced_char = dst - destination;
3093 }
3094
3095 \f
3096 /*** 5. CCL handlers ***/
3097
3098 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3099    Check each byte of a text against the valid_codes table of the
3100    coding system in coding_system_table[CODING_CATEGORY_IDX_CCL].  Return 0
3101    if there is none or a byte's entry is zero, else CODING_CATEGORY_MASK_CCL.  */
3102
3103 static int
3104 detect_coding_ccl (src, src_end, multibytep)
3105      unsigned char *src, *src_end;
3106      int multibytep;
3107 {
3108   unsigned char *valid;
3109   int c;
3110   /* Dummy for ONE_MORE_BYTE.  */
3111   struct coding_system dummy_coding;
3112   struct coding_system *coding = &dummy_coding;
3113
3114   /* No coding system is assigned to coding-category-ccl.  */
3115   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3116     return 0;
3117   /* NOTE(review): VALID is indexed by the fetched byte; presumably it has 256 entries -- confirm.  */
3118   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3119   while (1)
3120     {
3121       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3122       if (! valid[c])
3123         return 0;
3124     }
3125  label_end_of_loop:
3126   return CODING_CATEGORY_MASK_CCL;
3127 }
3128
3129 \f
3130 /*** 6. End-of-line handlers ***/
3131
3132 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3133 /* Convert CRLF or CR line ends to LF as selected by CODING->eol_type.  */
3134 static void
3135 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3136      struct coding_system *coding;
3137      unsigned char *source, *destination;
3138      int src_bytes, dst_bytes;
3139 {
3140   unsigned char *src = source;
3141   unsigned char *dst = destination;
3142   unsigned char *src_end = src + src_bytes;
3143   unsigned char *dst_end = dst + dst_bytes;
3144   Lisp_Object translation_table;
3145   /* SRC_BASE remembers the start position in source in each loop.
3146      The loop will be exited when there's not enough source code
3147      (within macro ONE_MORE_BYTE), or when there's not enough
3148      destination area to produce a character (within macro
3149      EMIT_CHAR).  */
3150   unsigned char *src_base;
3151   int c;
3152   /* NOTE(review): translation_table is only set here; presumably a macro below reads it -- confirm.  */
3153   translation_table = Qnil;
3154   switch (coding->eol_type)
3155     {
3156     case CODING_EOL_CRLF:
3157       while (1)
3158         {
3159           src_base = src;
3160           ONE_MORE_BYTE (c);
3161           if (c == '\r')
3162             {
3163               ONE_MORE_BYTE (c);
3164               if (c != '\n')
3165                 {
3166                   src--;        /* Reprocess that byte; emit a lone CR.  */
3167                   c = '\r';
3168                 }
3169             }
3170           else if (c == '\n'
3171                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3172             {
3173               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3174               goto label_end_of_loop;
3175             }
3176           EMIT_CHAR (c);
3177         }
3178       break;
3179
3180     case CODING_EOL_CR:
3181       while (1)
3182         {
3183           src_base = src;
3184           ONE_MORE_BYTE (c);
3185           if (c == '\n')
3186             {
3187               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3188                 {
3189                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3190                   goto label_end_of_loop;
3191                 }
3192             }
3193           else if (c == '\r')
3194             c = '\n';
3195           EMIT_CHAR (c);
3196         }
3197       break;
3198
3199     default:                    /* no need for EOL handling */
3200       while (1)
3201         {
3202           src_base = src;
3203           ONE_MORE_BYTE (c);
3204           EMIT_CHAR (c);
3205         }
3206     }
3207
3208  label_end_of_loop:
3209   coding->consumed = coding->consumed_char = src_base - source;
3210   coding->produced = dst - destination;
3211   return;
3212 }
3213
3214 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3215    format of end-of-line according to `coding->eol_type'.  It also
3216    converts multibyte form 8-bit characters to unibyte if
3217    CODING->src_multibyte is nonzero.  If `coding->mode &
3218    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3219    also means end-of-line.  */
3220
3221 static void
3222 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3223      struct coding_system *coding;
3224      const unsigned char *source;
3225      unsigned char *destination;
3226      int src_bytes, dst_bytes;
3227 {
3228   const unsigned char *src = source;
3229   unsigned char *dst = destination;
3230   const unsigned char *src_end = src + src_bytes;
3231   unsigned char *dst_end = dst + dst_bytes;
3232   Lisp_Object translation_table;
3233   /* SRC_BASE remembers the start position in source in each loop.
3234      The loop will be exited when there's not enough source text to
3235      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3236      there's not enough destination area to produce encoded codes
3237      (within macro EMIT_BYTES).  */
3238   const unsigned char *src_base;
3239   unsigned char *tmp;
3240   int c;
3241   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3242   translation_table = Qnil;
3243   /* Leave a trailing LEADING_CODE_8_BIT_CONTROL unconsumed and report
3244      insufficient source.  SRC_BYTES > 0 guards the read of SRC_END[-1].  */
3245   if (coding->src_multibyte && src_bytes > 0
3246       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3247     {
3248       src_end--;
3249       src_bytes--;
3250       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3251     }
3252   if (coding->eol_type == CODING_EOL_CRLF)
3253     {
3254       while (src < src_end)
3255         {
3256           src_base = src;
3257           c = *src++;
3258           if (c >= 0x20)
3259             EMIT_ONE_BYTE (c);
3260           else if (c == '\n' || (c == '\r' && selective_display))
3261             EMIT_TWO_BYTES ('\r', '\n');
3262           else
3263             EMIT_ONE_BYTE (c);
3264         }
3265       src_base = src;
3266     label_end_of_loop:
3267       ;
3268     }
3269   else
3270     {
3271       if (!dst_bytes || src_bytes <= dst_bytes)
3272         {
3273           safe_bcopy (src, dst, src_bytes);
3274           src_base = src_end;
3275           dst += src_bytes;
3276         }
3277       else
3278         {
3279           if (coding->src_multibyte     /* Do not split after this code.  */
3280               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3281             dst_bytes--;
3282           safe_bcopy (src, dst, dst_bytes);
3283           src_base = src + dst_bytes;
3284           dst = destination + dst_bytes;
3285           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3286         }
3287       if (coding->eol_type == CODING_EOL_CR)
3288         {
3289           for (tmp = destination; tmp < dst; tmp++)
3290             if (*tmp == '\n') *tmp = '\r';
3291         }
3292       else if (selective_display)
3293         {
3294           for (tmp = destination; tmp < dst; tmp++)
3295             if (*tmp == '\r') *tmp = '\n';
3296         }
3297     }
3298   if (coding->src_multibyte)
3299     dst = destination + str_as_unibyte (destination, dst - destination);
3300
3301   coding->consumed = src_base - source;
3302   coding->produced = dst - destination;
3303   coding->produced_char = coding->produced;
3304 }
3305
3306 \f
3307 /*** 7. C library functions ***/
3308
3309 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3310    has a property `coding-system'.  The value of this property is a
3311    vector of length 5 (called the coding-vector).  Among elements of
3312    this vector, the first (element[0]) and the fifth (element[4])
3313    carry important information for decoding/encoding.  Before
3314    decoding/encoding, this information should be set in fields of a
3315    structure of type `coding_system'.
3316
3317    The value of the property `coding-system' can be a symbol of another
3318    subsidiary coding-system.  In that case, Emacs gets coding-vector
3319    from that symbol.
3320
3321    `element[0]' contains information to be set in `coding->type'.  The
3322    value and its meaning is as follows:
3323
3324    0 -- coding_type_emacs_mule
3325    1 -- coding_type_sjis
3326    2 -- coding_type_iso2022
3327    3 -- coding_type_big5
3328    4 -- coding_type_ccl encoder/decoder written in CCL
3329    nil -- coding_type_no_conversion
3330    t -- coding_type_undecided (automatic conversion on decoding,
3331                                no-conversion on encoding)
3332
3333    `element[4]' contains information to be set in `coding->flags' and
3334    `coding->spec'.  The meaning varies by `coding->type'.
3335
3336    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3337    of length 32 (of which the first 17 sub-elements are used now).
3338    Meanings of these sub-elements are:
3339
3340    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3341         If the value is an integer of valid charset, the charset is
3342         assumed to be designated to graphic register N initially.
3343
3344         If the value is minus, it is a minus value of charset which
3345         reserves graphic register N, which means that the charset is
3346         not designated initially but should be designated to graphic
3347         register N just before encoding a character in that charset.
3348
3349         If the value is nil, graphic register N is never used on
3350         encoding.
3351
3352    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3353         Each value takes t or nil.  See the section ISO2022 of
3354         `coding.h' for more information.
3355
3356    If `coding->type' is `coding_type_big5', element[4] is t to denote
3357    BIG5-ETen or nil to denote BIG5-HKU.
3358
3359    If `coding->type' takes the other value, element[4] is ignored.
3360
3361    Emacs Lisp's coding systems also carry information about format of
3362    end-of-line in a value of property `eol-type'.  If the value is
3363    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3364    means CODING_EOL_CR.  If it is not integer, it should be a vector
3365    of subsidiary coding systems of which property `eol-type' has one
3366    of the above values.
3367
3368 */
3369
3370 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3371    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3372    is setup so that no conversion is necessary and return -1, else
3373    return 0.  */
3374
3375 int
3376 setup_coding_system (coding_system, coding)
3377      Lisp_Object coding_system;
3378      struct coding_system *coding;
3379 {
3380   Lisp_Object coding_spec, coding_type, eol_type, plist;
3381   Lisp_Object val;
3382
3383   /* At first, zero clear all members.  */
3384   bzero (coding, sizeof (struct coding_system));
3385
3386   /* Initialize some fields required for all kinds of coding systems.  */
3387   coding->symbol = coding_system;
3388   coding->heading_ascii = -1;
3389   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3390   coding->composing = COMPOSITION_DISABLED;
3391   coding->cmp_data = NULL;
3392
3393   if (NILP (coding_system))
3394     goto label_invalid_coding_system;
3395
3396   coding_spec = Fget (coding_system, Qcoding_system);
3397
3398   if (!VECTORP (coding_spec)
3399       || XVECTOR (coding_spec)->size != 5
3400       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3401     goto label_invalid_coding_system;
3402
3403   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3404   if (VECTORP (eol_type))
3405     {
3406       coding->eol_type = CODING_EOL_UNDECIDED;
3407       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3408     }
3409   else if (XFASTINT (eol_type) == 1)
3410     {
3411       coding->eol_type = CODING_EOL_CRLF;
3412       coding->common_flags
3413         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3414     }
3415   else if (XFASTINT (eol_type) == 2)
3416     {
3417       coding->eol_type = CODING_EOL_CR;
3418       coding->common_flags
3419         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3420     }
3421   else
3422     coding->eol_type = CODING_EOL_LF;
3423
3424   coding_type = XVECTOR (coding_spec)->contents[0];
3425   /* Try short cut.  */
3426   if (SYMBOLP (coding_type))
3427     {
3428       if (EQ (coding_type, Qt))
3429         {
3430           coding->type = coding_type_undecided;
3431           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3432         }
3433       else
3434         coding->type = coding_type_no_conversion;
3435       /* Initialize this member.  Any thing other than
3436          CODING_CATEGORY_IDX_UTF_16_BE and
3437          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3438          special treatment in detect_eol.  */
3439       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3440
3441       return 0;
3442     }
3443
3444   /* Get values of coding system properties:
3445      `post-read-conversion', `pre-write-conversion',
3446      `translation-table-for-decode', `translation-table-for-encode'.  */
3447   plist = XVECTOR (coding_spec)->contents[3];
3448   /* Pre & post conversion functions should be disabled if
3449      inhibit_pre_post_conversion is nonzero.  That is the case when a code
3450      conversion function is called while those functions are running.  */
3451   if (! inhibit_pre_post_conversion)
3452     {
3453       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3454       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3455     }
3456   val = Fplist_get (plist, Qtranslation_table_for_decode);
3457   if (SYMBOLP (val))
3458     val = Fget (val, Qtranslation_table_for_decode);
3459   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3460   val = Fplist_get (plist, Qtranslation_table_for_encode);
3461   if (SYMBOLP (val))
3462     val = Fget (val, Qtranslation_table_for_encode);
3463   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3464   val = Fplist_get (plist, Qcoding_category);
3465   if (!NILP (val))
3466     {
3467       val = Fget (val, Qcoding_category_index);
3468       if (INTEGERP (val))
3469         coding->category_idx = XINT (val);
3470       else
3471         goto label_invalid_coding_system;
3472     }
3473   else
3474     goto label_invalid_coding_system;
3475
3476   /* If the coding system has non-nil `composition' property, enable
3477      composition handling.  */
3478   val = Fplist_get (plist, Qcomposition);
3479   if (!NILP (val))
3480     coding->composing = COMPOSITION_NO;
3481
3482   switch (XFASTINT (coding_type))
3483     {
3484     case 0:
3485       coding->type = coding_type_emacs_mule;
3486       coding->common_flags
3487         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3488       if (!NILP (coding->post_read_conversion))
3489         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3490       if (!NILP (coding->pre_write_conversion))
3491         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3492       break;
3493
3494     case 1:
3495       coding->type = coding_type_sjis;
3496       coding->common_flags
3497         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3498       break;
3499
3500     case 2:
3501       coding->type = coding_type_iso2022;
3502       coding->common_flags
3503         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3504       {
3505         Lisp_Object val, temp;
3506         Lisp_Object *flags;
3507         int i, charset, reg_bits = 0;
3508
3509         val = XVECTOR (coding_spec)->contents[4];
3510
3511         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3512           goto label_invalid_coding_system;
3513
3514         flags = XVECTOR (val)->contents;
3515         coding->flags
3516           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3517              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3518              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3519              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3520              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3521              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3522              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3523              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3524              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3525              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3526              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3527              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3528              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3529              );
3530
3531         /* Invoke graphic register 0 to plane 0.  */
3532         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3533         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3534         CODING_SPEC_ISO_INVOCATION (coding, 1)
3535           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3536         /* Not single shifting at first.  */
3537         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3538         /* Beginning of buffer should also be regarded as bol. */
3539         CODING_SPEC_ISO_BOL (coding) = 1;
3540
3541         for (charset = 0; charset <= MAX_CHARSET; charset++)
3542           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3543         val = Vcharset_revision_alist;
3544         while (CONSP (val))
3545           {
3546             charset = get_charset_id (Fcar_safe (XCAR (val)));
3547             if (charset >= 0
3548                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3549                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3550               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3551             val = XCDR (val);
3552           }
3553
3554         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3555            FLAGS[REG] can be one of below:
3556                 integer CHARSET: CHARSET occupies register I,
3557                 t: designate nothing to REG initially, but can be used
3558                   by any charsets,
3559                 list of integer, nil, or t: designate the first
3560                   element (if integer) to REG initially, the remaining
3561                   elements (if integer) is designated to REG on request,
3562                   if an element is t, REG can be used by any charsets,
3563                 nil: REG is never used.  */
3564         for (charset = 0; charset <= MAX_CHARSET; charset++)
3565           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3566             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3567         for (i = 0; i < 4; i++)
3568           {
3569             if ((INTEGERP (flags[i])
3570                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3571                 || (charset = get_charset_id (flags[i])) >= 0)
3572               {
3573                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3574                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3575               }
3576             else if (EQ (flags[i], Qt))
3577               {
3578                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3579                 reg_bits |= 1 << i;
3580                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3581               }
3582             else if (CONSP (flags[i]))
3583               {
3584                 Lisp_Object tail;
3585                 tail = flags[i];
3586
3587                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3588                 if ((INTEGERP (XCAR (tail))
3589                      && (charset = XINT (XCAR (tail)),
3590                          CHARSET_VALID_P (charset)))
3591                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3592                   {
3593                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3594                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3595                   }
3596                 else
3597                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3598                 tail = XCDR (tail);
3599                 while (CONSP (tail))
3600                   {
3601                     if ((INTEGERP (XCAR (tail))
3602                          && (charset = XINT (XCAR (tail)),
3603                              CHARSET_VALID_P (charset)))
3604                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3605                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3606                         = i;
3607                     else if (EQ (XCAR (tail), Qt))
3608                       reg_bits |= 1 << i;
3609                     tail = XCDR (tail);
3610                   }
3611               }
3612             else
3613               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3614
3615             CODING_SPEC_ISO_DESIGNATION (coding, i)
3616               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3617           }
3618
3619         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3620           {
3621             /* REG 1 can be used only by locking shift in 7-bit env.  */
3622             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3623               reg_bits &= ~2;
3624             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3625               /* Without any shifting, only REG 0 and 1 can be used.  */
3626               reg_bits &= 3;
3627           }
3628
3629         if (reg_bits)
3630           for (charset = 0; charset <= MAX_CHARSET; charset++)
3631             {
3632               if (CHARSET_DEFINED_P (charset)
3633                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3634                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3635                 {
3636                   /* There exist some default graphic registers to be
3637                      used by CHARSET.  */
3638
3639                   /* We had better avoid designating a charset of
3640                      CHARS96 to REG 0 as far as possible.  */
3641                   if (CHARSET_CHARS (charset) == 96)
3642                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3643                       = (reg_bits & 2
3644                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3645                   else
3646                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3647                       = (reg_bits & 1
3648                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3649                 }
3650             }
3651       }
3652       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3653       coding->spec.iso2022.last_invalid_designation_register = -1;
3654       break;
3655
3656     case 3:
3657       coding->type = coding_type_big5;
3658       coding->common_flags
3659         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3660       coding->flags
3661         = (NILP (XVECTOR (coding_spec)->contents[4])
3662            ? CODING_FLAG_BIG5_HKU
3663            : CODING_FLAG_BIG5_ETEN);
3664       break;
3665
3666     case 4:
3667       coding->type = coding_type_ccl;
3668       coding->common_flags
3669         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3670       {
3671         val = XVECTOR (coding_spec)->contents[4];
3672         if (! CONSP (val)
3673             || setup_ccl_program (&(coding->spec.ccl.decoder),
3674                                   XCAR (val)) < 0
3675             || setup_ccl_program (&(coding->spec.ccl.encoder),
3676                                   XCDR (val)) < 0)
3677           goto label_invalid_coding_system;
3678
3679         bzero (coding->spec.ccl.valid_codes, 256);
3680         val = Fplist_get (plist, Qvalid_codes);
3681         if (CONSP (val))
3682           {
3683             Lisp_Object this;
3684
3685             for (; CONSP (val); val = XCDR (val))
3686               {
3687                 this = XCAR (val);
3688                 if (INTEGERP (this)
3689                     && XINT (this) >= 0 && XINT (this) < 256)
3690                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3691                 else if (CONSP (this)
3692                          && INTEGERP (XCAR (this))
3693                          && INTEGERP (XCDR (this)))
3694                   {
3695                     int start = XINT (XCAR (this));
3696                     int end = XINT (XCDR (this));
3697
3698                     if (start >= 0 && start <= end && end < 256)
3699                       while (start <= end)
3700                         coding->spec.ccl.valid_codes[start++] = 1;
3701                   }
3702               }
3703           }
3704       }
3705       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3706       coding->spec.ccl.cr_carryover = 0;
3707       coding->spec.ccl.eight_bit_carryover[0] = 0;
3708       break;
3709
3710     case 5:
3711       coding->type = coding_type_raw_text;
3712       break;
3713
3714     default:
3715       goto label_invalid_coding_system;
3716     }
3717   return 0;
3718
3719  label_invalid_coding_system:
3720   coding->type = coding_type_no_conversion;
3721   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3722   coding->common_flags = 0;
3723   coding->eol_type = CODING_EOL_LF;
3724   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3725   return -1;
3726 }
3727
3728 /* Free memory blocks allocated for storing composition information.  */
3729
3730 void
3731 coding_free_composition_data (coding)
3732      struct coding_system *coding;
3733 {
3734   struct composition_data *cmp_data = coding->cmp_data, *next;
3735
3736   if (!cmp_data)
3737     return;
3738   /* Memory blocks are chained.  At first, rewind to the first, then,
3739      free blocks one by one.  */
3740   while (cmp_data->prev)
3741     cmp_data = cmp_data->prev;
3742   while (cmp_data)
3743     {
3744       next = cmp_data->next;
3745       xfree (cmp_data);
3746       cmp_data = next;
3747     }
3748   coding->cmp_data = NULL;
3749 }
3750
3751 /* Set `char_offset' member of all memory blocks pointed by
3752    coding->cmp_data to POS.  */
3753
3754 void
3755 coding_adjust_composition_offset (coding, pos)
3756      struct coding_system *coding;
3757      int pos;
3758 {
3759   struct composition_data *cmp_data;
3760
3761   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3762     cmp_data->char_offset = pos;
3763 }
3764
3765 /* Setup raw-text or one of its subsidiaries in the structure
3766    coding_system CODING according to the already setup value eol_type
3767    in CODING.  CODING should be setup for some coding system in
3768    advance.  */
3769
3770 void
3771 setup_raw_text_coding_system (coding)
3772      struct coding_system *coding;
3773 {
3774   if (coding->type != coding_type_raw_text)
3775     {
3776       coding->symbol = Qraw_text;
3777       coding->type = coding_type_raw_text;
3778       if (coding->eol_type != CODING_EOL_UNDECIDED)
3779         {
3780           Lisp_Object subsidiaries;
3781           subsidiaries = Fget (Qraw_text, Qeol_type);
3782
3783           if (VECTORP (subsidiaries)
3784               && XVECTOR (subsidiaries)->size == 3)
3785             coding->symbol
3786               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3787         }
3788       setup_coding_system (coding->symbol, coding);
3789     }
3790   return;
3791 }
3792
3793 /* Emacs has a mechanism to automatically detect a coding system if it
3794    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3795    it's impossible to distinguish some coding systems accurately
3796    because they use the same range of codes.  So, at first, coding
3797    systems are classified into the categories listed below:
3798
3799    o coding-category-emacs-mule
3800
3801         The category for a coding system which has the same code range
3802         as Emacs' internal format.  Assigned the coding-system (Lisp
3803         symbol) `emacs-mule' by default.
3804
3805    o coding-category-sjis
3806
3807         The category for a coding system which has the same code range
3808         as SJIS.  Assigned the coding-system (Lisp
3809         symbol) `japanese-shift-jis' by default.
3810
3811    o coding-category-iso-7
3812
3813         The category for a coding system which has the same code range
3814         as ISO2022 of 7-bit environment.  This doesn't use any locking
3815         shift and single shift functions.  This can encode/decode all
3816         charsets.  Assigned the coding-system (Lisp symbol)
3817         `iso-2022-7bit' by default.
3818
3819    o coding-category-iso-7-tight
3820
3821         Same as coding-category-iso-7 except that this can
3822         encode/decode only the specified charsets.
3823
3824    o coding-category-iso-8-1
3825
3826         The category for a coding system which has the same code range
3827         as ISO2022 of 8-bit environment and graphic plane 1 used only
3828         for DIMENSION1 charset.  This doesn't use any locking shift
3829         and single shift functions.  Assigned the coding-system (Lisp
3830         symbol) `iso-latin-1' by default.
3831
3832    o coding-category-iso-8-2
3833
3834         The category for a coding system which has the same code range
3835         as ISO2022 of 8-bit environment and graphic plane 1 used only
3836         for DIMENSION2 charset.  This doesn't use any locking shift
3837         and single shift functions.  Assigned the coding-system (Lisp
3838         symbol) `japanese-iso-8bit' by default.
3839
3840    o coding-category-iso-7-else
3841
3842         The category for a coding system which has the same code range
3843         as ISO2022 of 7-bit environment but uses locking shift or
3844         single shift functions.  Assigned the coding-system (Lisp
3845         symbol) `iso-2022-7bit-lock' by default.
3846
3847    o coding-category-iso-8-else
3848
3849         The category for a coding system which has the same code range
3850         as ISO2022 of 8-bit environment but uses locking shift or
3851         single shift functions.  Assigned the coding-system (Lisp
3852         symbol) `iso-2022-8bit-ss2' by default.
3853
3854    o coding-category-big5
3855
3856         The category for a coding system which has the same code range
3857         as BIG5.  Assigned the coding-system (Lisp symbol)
3858         `cn-big5' by default.
3859
3860    o coding-category-utf-8
3861
3862         The category for a coding system which has the same code range
3863         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3864         symbol) `utf-8' by default.
3865
3866    o coding-category-utf-16-be
3867
3868         The category for a coding system in which a text has an
3869         Unicode signature (cf. Unicode Standard) in the order of BIG
3870         endian at the head.  Assigned the coding-system (Lisp symbol)
3871         `utf-16-be' by default.
3872
3873    o coding-category-utf-16-le
3874
3875         The category for a coding system in which a text has an
3876         Unicode signature (cf. Unicode Standard) in the order of
3877         LITTLE endian at the head.  Assigned the coding-system (Lisp
3878         symbol) `utf-16-le' by default.
3879
3880    o coding-category-ccl
3881
3882         The category for a coding system of which encoder/decoder is
3883         written in CCL programs.  The default value is nil, i.e., no
3884         coding system is assigned.
3885
3886    o coding-category-binary
3887
3888         The category for a coding system not categorized in any of the
3889         above.  Assigned the coding-system (Lisp symbol)
3890         `no-conversion' by default.
3891
3892    Each of them is a Lisp symbol and the value is an actual
3893    `coding-system' (this is also a Lisp symbol) assigned by a user.
3894    What Emacs does actually is to detect a category of coding system.
3895    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3896    decide a single possible category, it selects a category of the
3897    highest priority.  Priorities of categories are also specified by a
3898    user in a Lisp variable `coding-category-list'.
3899
3900 */
3901
/* Table indexed by byte value.  detect_coding_mask skips over leading
   bytes whose entry is nonzero.  It also rewrites the entries for
   ISO_CODE_ESC, ISO_CODE_SO and ISO_CODE_SI while it runs: they are
   cleared on entry and set again once such a byte turns out not to
   start a valid ISO2022 sequence.  */
static
int ascii_skip_code[256];
3904
/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
   If it detects possible coding systems, return an integer in which
   appropriate flag bits are set.  Flag bits are defined by macros
   CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
   it should point the table `coding_priorities'.  In that case, only
   the flag bit for a coding system of the highest priority is set in
   the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
   range 0x80..0x9F are in multibyte form.

   Return 0 if the text consists only of bytes that are skipped as
   ASCII.  When PRIORITIES is non-NULL and no entry of it matches,
   CODING_CATEGORY_MASK_RAW_TEXT is returned.  When PRIORITIES is
   NULL, the bits for raw-text and binary are always added to the
   detected mask.

   How many ASCII characters are at the head is returned as *SKIP.

   This function modifies the file-scope table `ascii_skip_code'.  */

static int
detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
     unsigned char *source;
     int src_bytes, *priorities, *skip;
     int multibytep;
{
  register unsigned char c;
  unsigned char *src = source, *src_end = source + src_bytes;
  unsigned int mask, utf16_examined_p, iso2022_examined_p;
  int i;

  /* At first, skip all ASCII characters and control characters except
     for three ISO2022 specific control characters.  */
  ascii_skip_code[ISO_CODE_SO] = 0;
  ascii_skip_code[ISO_CODE_SI] = 0;
  ascii_skip_code[ISO_CODE_ESC] = 0;

 label_loop_detect_coding:
  while (src < src_end && ascii_skip_code[*src]) src++;
  *skip = src - source;

  if (src >= src_end)
    /* We found nothing other than ASCII.  There's nothing to do.  */
    return 0;

  c = *src;
  /* The text seems to be encoded in some multilingual coding system.
     Now, try to find in which coding system the text is encoded.  */
  if (c < 0x80)
    {
      /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
      /* C is an ISO2022 specific control code of C0.  */
      mask = detect_coding_iso2022 (src, src_end, multibytep);
      if (mask == 0)
        {
          /* No valid ISO2022 code follows C.  Try again.  From now
             on treat this kind of control code as skippable, so that
             the skip loop above does not stop at it again.  */
          src++;
          if (c == ISO_CODE_ESC)
            ascii_skip_code[ISO_CODE_ESC] = 1;
          else
            ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
          goto label_loop_detect_coding;
        }
      if (priorities)
        {
          /* Return the first (highest priority) category that the
             ISO2022 detector accepted.  */
          for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
            {
              if (mask & priorities[i])
                return priorities[i];
            }
          return CODING_CATEGORY_MASK_RAW_TEXT;
        }
      /* Without PRIORITIES, fall through to the final return with the
         ISO2022 mask.  */
    }
  else
    {
      /* Set of categories worth examining, chosen from the value of C.  */
      int try;

      /* NOTE(review): src[1] is read here without checking it against
         SRC_END; this presumably relies on a leading code always being
         followed by another byte in multibyte text -- confirm.  */
      if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
        c = src[1] - 0x20;

      if (c < 0xA0)
        {
          /* C is the first byte of SJIS character code,
             or a leading-code of Emacs' internal format (emacs-mule),
             or the first byte of UTF-16.  */
          try = (CODING_CATEGORY_MASK_SJIS
                  | CODING_CATEGORY_MASK_EMACS_MULE
                  | CODING_CATEGORY_MASK_UTF_16_BE
                  | CODING_CATEGORY_MASK_UTF_16_LE);

          /* Or, if C is a special latin extra code,
             or is an ISO2022 specific control code of C1 (SS2 or SS3),
             or is an ISO2022 control-sequence-introducer (CSI),
             we should also consider the possibility of ISO2022 codings.  */
          /* NOTE(review): SRC has not been advanced past C at this
             point, so `*src' below is the byte C was read from rather
             than the byte following the CSI; the tests against ']',
             '0', '1' and '2' look as if they were meant for the
             following bytes -- confirm before relying on the CSI
             case.  */
          if ((VECTORP (Vlatin_extra_code_table)
               && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
              || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
              || (c == ISO_CODE_CSI
                  && (src < src_end
                      && (*src == ']'
                          || ((*src == '0' || *src == '1' || *src == '2')
                              && src + 1 < src_end
                              && src[1] == ']')))))
            try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
                     | CODING_CATEGORY_MASK_ISO_8BIT);
        }
      else
        /* C is a character of ISO2022 in graphic plane right,
           or a SJIS's 1-byte character code (i.e. JISX0201),
           or the first byte of BIG5's 2-byte code,
           or the first byte of UTF-8/16.  */
        try = (CODING_CATEGORY_MASK_ISO_8_ELSE
                | CODING_CATEGORY_MASK_ISO_8BIT
                | CODING_CATEGORY_MASK_SJIS
                | CODING_CATEGORY_MASK_BIG5
                | CODING_CATEGORY_MASK_UTF_8
                | CODING_CATEGORY_MASK_UTF_16_BE
                | CODING_CATEGORY_MASK_UTF_16_LE);

      /* Or, we may have to consider the possibility of CCL.  */
      if (coding_system_table[CODING_CATEGORY_IDX_CCL]
          && (coding_system_table[CODING_CATEGORY_IDX_CCL]
              ->spec.ccl.valid_codes)[c])
        try |= CODING_CATEGORY_MASK_CCL;

      mask = 0;
      utf16_examined_p = iso2022_examined_p = 0;
      if (priorities)
        {
          /* Walk the categories in priority order, running the
             detector for each candidate and returning as soon as the
             accumulated MASK contains the category being looked at.
             The ISO2022 and UTF-16 detectors each cover several
             categories, so the *_examined_p flags make sure each of
             them is run only once.  */
          for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
            {
              if (!iso2022_examined_p
                  && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
                {
                  mask |= detect_coding_iso2022 (src, src_end, multibytep);
                  iso2022_examined_p = 1;
                }
              else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
                mask |= detect_coding_sjis (src, src_end, multibytep);
              else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
                mask |= detect_coding_utf_8 (src, src_end, multibytep);
              else if (!utf16_examined_p
                       && (priorities[i] & try &
                           CODING_CATEGORY_MASK_UTF_16_BE_LE))
                {
                  mask |= detect_coding_utf_16 (src, src_end, multibytep);
                  utf16_examined_p = 1;
                }
              else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
                mask |= detect_coding_big5 (src, src_end, multibytep);
              else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
                mask |= detect_coding_emacs_mule (src, src_end, multibytep);
              else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
                mask |= detect_coding_ccl (src, src_end, multibytep);
              /* raw-text and binary need no detector; they match as
                 soon as their turn in the priority order comes.  */
              else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
                mask |= CODING_CATEGORY_MASK_RAW_TEXT;
              else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
                mask |= CODING_CATEGORY_MASK_BINARY;
              if (mask & priorities[i])
                return priorities[i];
            }
          return CODING_CATEGORY_MASK_RAW_TEXT;
        }
      /* No priority table: run every candidate detector and collect
         all the categories they report.  */
      if (try & CODING_CATEGORY_MASK_ISO)
        mask |= detect_coding_iso2022 (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_SJIS)
        mask |= detect_coding_sjis (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_BIG5)
        mask |= detect_coding_big5 (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_UTF_8)
        mask |= detect_coding_utf_8 (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
        mask |= detect_coding_utf_16 (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_EMACS_MULE)
        mask |= detect_coding_emacs_mule (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_CCL)
        mask |= detect_coding_ccl (src, src_end, multibytep);
    }
  return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
}
4076
4077 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4078    The information of the detected coding system is set in CODING.  */
4079
4080 void
4081 detect_coding (coding, src, src_bytes)
4082      struct coding_system *coding;
4083      const unsigned char *src;
4084      int src_bytes;
4085 {
4086   unsigned int idx;
4087   int skip, mask;
4088   Lisp_Object val;
4089
4090   val = Vcoding_category_list;
4091   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4092                              coding->src_multibyte);
4093   coding->heading_ascii = skip;
4094
4095   if (!mask) return;
4096
4097   /* We found a single coding system of the highest priority in MASK.  */
4098   idx = 0;
4099   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4100   if (! mask)
4101     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4102
4103   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4104
4105   if (coding->eol_type != CODING_EOL_UNDECIDED)
4106     {
4107       Lisp_Object tmp;
4108
4109       tmp = Fget (val, Qeol_type);
4110       if (VECTORP (tmp))
4111         val = XVECTOR (tmp)->contents[coding->eol_type];
4112     }
4113
4114   /* Setup this new coding system while preserving some slots.  */
4115   {
4116     int src_multibyte = coding->src_multibyte;
4117     int dst_multibyte = coding->dst_multibyte;
4118
4119     setup_coding_system (val, coding);
4120     coding->src_multibyte = src_multibyte;
4121     coding->dst_multibyte = dst_multibyte;
4122     coding->heading_ascii = skip;
4123   }
4124 }
4125
/* Detect how end-of-line of a text of length SRC_BYTES pointed by
   SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
   CODING_EOL_CR, and CODING_EOL_UNDECIDED, or CODING_EOL_INCONSISTENT
   if end-of-lines of different kinds are found.  At most
   MAX_EOL_CHECK_COUNT end-of-lines are examined.

   How many non-eol characters are at the head is returned as *SKIP,
   i.e. the offset of the first end-of-line, or SRC_BYTES if the text
   contains no end-of-line at all.  */

#define MAX_EOL_CHECK_COUNT 3

static int
detect_eol_type (source, src_bytes, skip)
     unsigned char *source;
     int src_bytes, *skip;
{
  unsigned char *src = source, *src_end = src + src_bytes;
  unsigned char c;
  int total = 0;                /* How many end-of-lines are found so far.  */
  int eol_type = CODING_EOL_UNDECIDED;
  int this_eol_type;

  *skip = 0;

  while (src < src_end && total < MAX_EOL_CHECK_COUNT)
    {
      c = *src++;
      if (c == '\n' || c == '\r')
        {
          /* Record the position of the first end-of-line only.  TOTAL
             is used to recognize "first" because 0 is a legitimate
             offset when the text begins with an end-of-line.  */
          if (total == 0)
            *skip = src - 1 - source;
          total++;
          if (c == '\n')
            this_eol_type = CODING_EOL_LF;
          else if (src >= src_end || *src != '\n')
            this_eol_type = CODING_EOL_CR;
          else
            /* CR followed by LF; consume the LF too.  */
            this_eol_type = CODING_EOL_CRLF, src++;

          if (eol_type == CODING_EOL_UNDECIDED)
            /* This is the first end-of-line.  */
            eol_type = this_eol_type;
          else if (eol_type != this_eol_type)
            {
              /* The found type is different from what found before.  */
              eol_type = CODING_EOL_INCONSISTENT;
              break;
            }
        }
    }

  /* No end-of-line at all: the whole text is the head.  */
  if (total == 0)
    *skip = src_end - source;
  return eol_type;
}
4178
/* Like detect_eol_type, but detect EOL type in 2-octet
   big-endian/little-endian format for coding systems utf-16-be and
   utf-16-le.  BIG_ENDIAN_P nonzero means the first octet of each pair
   is the most significant one.  A trailing odd octet is ignored.

   *SKIP is set to the byte offset of the first end-of-line, or to
   SRC_BYTES if no end-of-line is found.  */

static int
detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
     unsigned char *source;
     int src_bytes, *skip, big_endian_p;
{
  unsigned char *src = source, *src_end = src + src_bytes;
  unsigned int c1, c2;
  int total = 0;                /* How many end-of-lines are found so far.  */
  int eol_type = CODING_EOL_UNDECIDED;
  int this_eol_type;
  int msb, lsb;                 /* Offsets of the high and low octets.  */

  if (big_endian_p)
    msb = 0, lsb = 1;
  else
    msb = 1, lsb = 0;

  *skip = 0;

  while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
    {
      c1 = (src[msb] << 8) | (src[lsb]);
      src += 2;

      if (c1 == '\n' || c1 == '\r')
        {
          /* Record the position of the first end-of-line only.  TOTAL
             is used to recognize "first" because 0 is a legitimate
             offset when the text begins with an end-of-line.  */
          if (total == 0)
            *skip = src - 2 - source;
          total++;
          if (c1 == '\n')
            {
              this_eol_type = CODING_EOL_LF;
            }
          else
            {
              if ((src + 1) >= src_end)
                {
                  /* CR is the last complete 2-octet unit.  */
                  this_eol_type = CODING_EOL_CR;
                }
              else
                {
                  /* Peek at the next unit to tell CRLF from CR.  */
                  c2 = (src[msb] << 8) | (src[lsb]);
                  if (c2 == '\n')
                    this_eol_type = CODING_EOL_CRLF, src += 2;
                  else
                    this_eol_type = CODING_EOL_CR;
                }
            }

          if (eol_type == CODING_EOL_UNDECIDED)
            /* This is the first end-of-line.  */
            eol_type = this_eol_type;
          else if (eol_type != this_eol_type)
            {
              /* The found type is different from what found before.  */
              eol_type = CODING_EOL_INCONSISTENT;
              break;
            }
        }
    }

  /* No end-of-line at all: the whole text is the head.  */
  if (total == 0)
    *skip = src_end - source;
  return eol_type;
}
4248
4249 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4250    is encoded.  If it detects an appropriate format of end-of-line, it
4251    sets the information in *CODING.  */
4252
4253 void
4254 detect_eol (coding, src, src_bytes)
4255      struct coding_system *coding;
4256      const unsigned char *src;
4257      int src_bytes;
4258 {
4259   Lisp_Object val;
4260   int skip;
4261   int eol_type;
4262
4263   switch (coding->category_idx)
4264     {
4265     case CODING_CATEGORY_IDX_UTF_16_BE:
4266       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4267       break;
4268     case CODING_CATEGORY_IDX_UTF_16_LE:
4269       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4270       break;
4271     default:
4272       eol_type = detect_eol_type (src, src_bytes, &skip);
4273       break;
4274     }
4275
4276   if (coding->heading_ascii > skip)
4277     coding->heading_ascii = skip;
4278   else
4279     skip = coding->heading_ascii;
4280
4281   if (eol_type == CODING_EOL_UNDECIDED)
4282     return;
4283   if (eol_type == CODING_EOL_INCONSISTENT)
4284     {
4285 #if 0
4286       /* This code is suppressed until we find a better way to
4287          distinguish raw text file and binary file.  */
4288
4289       /* If we have already detected that the coding is raw-text, the
4290          coding should actually be no-conversion.  */
4291       if (coding->type == coding_type_raw_text)
4292         {
4293           setup_coding_system (Qno_conversion, coding);
4294           return;
4295         }
4296       /* Else, let's decode only text code anyway.  */
4297 #endif /* 0 */
4298       eol_type = CODING_EOL_LF;
4299     }
4300
4301   val = Fget (coding->symbol, Qeol_type);
4302   if (VECTORP (val) && XVECTOR (val)->size == 3)
4303     {
4304       int src_multibyte = coding->src_multibyte;
4305       int dst_multibyte = coding->dst_multibyte;
4306       struct composition_data *cmp_data = coding->cmp_data;
4307
4308       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4309       coding->src_multibyte = src_multibyte;
4310       coding->dst_multibyte = dst_multibyte;
4311       coding->heading_ascii = skip;
4312       coding->cmp_data = cmp_data;
4313     }
4314 }
4315
4316 #define CONVERSION_BUFFER_EXTRA_ROOM 256
4317
4318 #define DECODING_BUFFER_MAG(coding)                     \
4319   (coding->type == coding_type_iso2022                  \
4320    ? 3                                                  \
4321    : (coding->type == coding_type_ccl                   \
4322       ? coding->spec.ccl.decoder.buf_magnification      \
4323       : 2))
4324
4325 /* Return maximum size (bytes) of a buffer enough for decoding
4326    SRC_BYTES of text encoded in CODING.  */
4327
4328 int
4329 decoding_buffer_size (coding, src_bytes)
4330      struct coding_system *coding;
4331      int src_bytes;
4332 {
4333   return (src_bytes * DECODING_BUFFER_MAG (coding)
4334           + CONVERSION_BUFFER_EXTRA_ROOM);
4335 }
4336
4337 /* Return maximum size (bytes) of a buffer enough for encoding
4338    SRC_BYTES of text to CODING.  */
4339
4340 int
4341 encoding_buffer_size (coding, src_bytes)
4342      struct coding_system *coding;
4343      int src_bytes;
4344 {
4345   int magnification;
4346
4347   if (coding->type == coding_type_ccl)
4348     magnification = coding->spec.ccl.encoder.buf_magnification;
4349   else if (CODING_REQUIRE_ENCODING (coding))
4350     magnification = 3;
4351   else
4352     magnification = 1;
4353
4354   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4355 }
4356
/* Working buffer for code conversion.  It is set up by the macro
   allocate_conversion_buffer, enlarged by extend_conversion_buffer,
   and released by free_conversion_buffer.  */
struct conversion_buffer
{
  int size;                     /* Number of bytes allocated at DATA.  */
  int on_stack;                 /* 1 if allocated by alloca, 0 if by
                                   xmalloc/xrealloc.  */
  unsigned char *data;          /* The storage itself.  */
};
4364
/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  */
#define MAX_ALLOCA (16 * 1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are served by alloca, so this
   macro must be used in the function that owns BUF; larger ones are
   served by xmalloc.  BUF.on_stack records which one was used and
   BUF.size is set to LEN.  BUF and LEN are evaluated more than once,
   so they must not have side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4384
4385 /* Double the allocated memory for *BUF.  */
4386 static void
4387 extend_conversion_buffer (buf)
4388      struct conversion_buffer *buf;
4389 {
4390   if (buf->on_stack)
4391     {
4392       unsigned char *save = buf->data;
4393       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4394       bcopy (save, buf->data, buf->size);
4395       buf->on_stack = 0;
4396     }
4397   else
4398     {
4399       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4400     }
4401   buf->size *= 2;
4402 }
4403
4404 /* Free the allocated memory for BUF if it is not on stack.  */
4405 static void
4406 free_conversion_buffer (buf)
4407      struct conversion_buffer *buf;
4408 {
4409   if (!buf->on_stack)
4410     xfree (buf->data);
4411 }
4412
/* Run the CCL program of CODING on SRC_BYTES bytes at SOURCE, storing
   the output at DESTINATION.  The encoder program is used if ENCODEP
   is nonzero, the decoder program otherwise.  DST_BYTES is handed to
   ccl_driver after subtracting any carryover bytes that are put at
   the head of DESTINATION first (a zero DST_BYTES is passed on
   unchanged).

   CODING->produced, CODING->produced_char and CODING->consumed are
   updated, and CODING->result is set according to the final status of
   the CCL program; that result is also the return value.

   On decoding, if the CCL program produced 8-bit codes, trailing
   bytes that may be the beginning of a multibyte sequence are saved
   in CODING->spec.ccl.eight_bit_carryover (unless this is the last
   block and all the source was consumed) and are emitted first on the
   next call.  */
int
ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes, encodep;
{
  struct ccl_program *ccl
    = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
  unsigned char *dst = destination;

  ccl->suppress_error = coding->suppress_error;
  ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
  if (encodep)
    {
      /* On encoding, EOL format is converted within ccl_driver.  For
         that, setup proper information in the structure CCL.  */
      ccl->eol_type = coding->eol_type;
      if (ccl->eol_type ==CODING_EOL_UNDECIDED)
        ccl->eol_type = CODING_EOL_LF;
      ccl->cr_consumed = coding->spec.ccl.cr_carryover;
    }
  ccl->multibyte = coding->src_multibyte;
  if (coding->spec.ccl.eight_bit_carryover[0] != 0)
    {
      /* Move carryover bytes to DESTINATION.  The carryover area is
         a zero-terminated byte sequence.  */
      unsigned char *p = coding->spec.ccl.eight_bit_carryover;
      while (*p)
        *dst++ = *p++;
      coding->spec.ccl.eight_bit_carryover[0] = 0;
      if (dst_bytes)
        dst_bytes -= dst - destination;
    }

  /* The count returned by ccl_driver does not include the carryover
     bytes stored above, so add them (DST - DESTINATION).  */
  coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
                                  &(coding->consumed))
                      + dst - destination);

  if (encodep)
    {
      /* The result of encoding is counted one character per byte.  */
      coding->produced_char = coding->produced;
      coding->spec.ccl.cr_carryover = ccl->cr_consumed;
    }
  else if (!ccl->eight_bit_control)
    {
      /* The produced bytes forms a valid multibyte sequence. */
      coding->produced_char
        = multibyte_chars_in_text (destination, coding->produced);
      coding->spec.ccl.eight_bit_carryover[0] = 0;
    }
  else
    {
      /* On decoding, the destination should always be multibyte.
         But the CCL program might have generated an invalid
         multibyte sequence.  Here we make such a sequence valid as
         multibyte.  */
      int bytes
        = dst_bytes ? dst_bytes : source + coding->consumed - destination;

      if ((coding->consumed < src_bytes
           || !ccl->last_block)
          && coding->produced >= 1
          && destination[coding->produced - 1] >= 0x80)
        {
          /* We should not convert the trailing 8-bit codes to
             multibyte form even if they don't form a valid
             multibyte sequence.  They may form a valid sequence in
             the next call.  */
          int carryover = 0;

          /* Look back over at most three trailing bytes for a byte
             in the range 0x80..0x9F that is followed only by bytes
             >= 0xA0; CARRYOVER is the number of bytes from that byte
             to the end.  */
          if (destination[coding->produced - 1] < 0xA0)
            carryover = 1;
          else if (coding->produced >= 2)
            {
              if (destination[coding->produced - 2] >= 0x80)
                {
                  if (destination[coding->produced - 2] < 0xA0)
                    carryover = 2;
                  else if (coding->produced >= 3
                           && destination[coding->produced - 3] >= 0x80
                           && destination[coding->produced - 3] < 0xA0)
                    carryover = 3;
                }
            }
          if (carryover > 0)
            {
              /* Save the bytes (zero-terminated) for the next call
                 and drop them from the produced text.  */
              BCOPY_SHORT (destination + coding->produced - carryover,
                           coding->spec.ccl.eight_bit_carryover,
                           carryover);
              coding->spec.ccl.eight_bit_carryover[carryover] = 0;
              coding->produced -= carryover;
            }
        }
      coding->produced = str_as_multibyte (destination, bytes,
                                           coding->produced,
                                           &(coding->produced_char));
    }

  /* Translate the final status of the CCL program into a
     CODING_FINISH_XXX result code.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      coding->result = CODING_FINISH_INSUFFICIENT_SRC;
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      coding->result = CODING_FINISH_INSUFFICIENT_DST;
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      coding->result = CODING_FINISH_INTERRUPT;
      break;
    default:
      coding->result = CODING_FINISH_NORMAL;
      break;
    }
  return coding->result;
}
4528
4529 /* Decode EOL format of the text at PTR of BYTES length destructively
4530    according to CODING->eol_type.  This is called after the CCL
4531    program produced a decoded text at PTR.  If we do CRLF->LF
4532    conversion, update CODING->produced and CODING->produced_char.  */
4533
4534 static void
4535 decode_eol_post_ccl (coding, ptr, bytes)
4536      struct coding_system *coding;
4537      unsigned char *ptr;
4538      int bytes;
4539 {
4540   Lisp_Object val, saved_coding_symbol;
4541   unsigned char *pend = ptr + bytes;
4542   int dummy;
4543
4544   /* Remember the current coding system symbol.  We set it back when
4545      an inconsistent EOL is found so that `last-coding-system-used' is
4546      set to the coding system that doesn't specify EOL conversion.  */
4547   saved_coding_symbol = coding->symbol;
4548
4549   coding->spec.ccl.cr_carryover = 0;
4550   if (coding->eol_type == CODING_EOL_UNDECIDED)
4551     {
4552       /* Here, to avoid the call of setup_coding_system, we directly
4553          call detect_eol_type.  */
4554       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4555       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4556         coding->eol_type = CODING_EOL_LF;
4557       if (coding->eol_type != CODING_EOL_UNDECIDED)
4558         {
4559           val = Fget (coding->symbol, Qeol_type);
4560           if (VECTORP (val) && XVECTOR (val)->size == 3)
4561             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4562         }
4563       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4564     }
4565
4566   if (coding->eol_type == CODING_EOL_LF
4567       || coding->eol_type == CODING_EOL_UNDECIDED)
4568     {
4569       /* We have nothing to do.  */
4570       ptr = pend;
4571     }
4572   else if (coding->eol_type == CODING_EOL_CRLF)
4573     {
4574       unsigned char *pstart = ptr, *p = ptr;
4575
4576       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4577           && *(pend - 1) == '\r')
4578         {
4579           /* If the last character is CR, we can't handle it here
4580              because LF will be in the not-yet-decoded source text.
4581              Record that the CR is not yet processed.  */
4582           coding->spec.ccl.cr_carryover = 1;
4583           coding->produced--;
4584           coding->produced_char--;
4585           pend--;
4586         }
4587       while (ptr < pend)
4588         {
4589           if (*ptr == '\r')
4590             {
4591               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4592                 {
4593                   *p++ = '\n';
4594                   ptr += 2;
4595                 }
4596               else
4597                 {
4598                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4599                     goto undo_eol_conversion;
4600                   *p++ = *ptr++;
4601                 }
4602             }
4603           else if (*ptr == '\n'
4604                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4605             goto undo_eol_conversion;
4606           else
4607             *p++ = *ptr++;
4608           continue;
4609
4610         undo_eol_conversion:
4611           /* We have faced with inconsistent EOL format at PTR.
4612              Convert all LFs before PTR back to CRLFs.  */
4613           for (p--, ptr--; p >= pstart; p--)
4614             {
4615               if (*p == '\n')
4616                 *ptr-- = '\n', *ptr-- = '\r';
4617               else
4618                 *ptr-- = *p;
4619             }
4620           /*  If carryover is recorded, cancel it because we don't
4621               convert CRLF anymore.  */
4622           if (coding->spec.ccl.cr_carryover)
4623             {
4624               coding->spec.ccl.cr_carryover = 0;
4625               coding->produced++;
4626               coding->produced_char++;
4627               pend++;
4628             }
4629           p = ptr = pend;
4630           coding->eol_type = CODING_EOL_LF;
4631           coding->symbol = saved_coding_symbol;
4632         }
4633       if (p < pend)
4634         {
4635           /* As each two-byte sequence CRLF was converted to LF, (PEND
4636              - P) is the number of deleted characters.  */
4637           coding->produced -= pend - p;
4638           coding->produced_char -= pend - p;
4639         }
4640     }
4641   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4642     {
4643       unsigned char *p = ptr;
4644
4645       for (; ptr < pend; ptr++)
4646         {
4647           if (*ptr == '\r')
4648             *ptr = '\n';
4649           else if (*ptr == '\n'
4650                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4651             {
4652               for (; p < ptr; p++)
4653                 {
4654                   if (*p == '\n')
4655                     *p = '\r';
4656                 }
4657               ptr = pend;
4658               coding->eol_type = CODING_EOL_LF;
4659               coding->symbol = saved_coding_symbol;
4660             }
4661         }
4662     }
4663 }
4664
/* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
   decoding, it may detect coding system and format of end-of-line if
   those are not yet decided.  The source should be unibyte, the
   result is multibyte if CODING->dst_multibyte is nonzero, else
   unibyte.

   Decode SRC_BYTES bytes at SOURCE into DESTINATION by the decoder
   selected by CODING->type.  The counters CODING->produced,
   CODING->produced_char, CODING->consumed, CODING->consumed_char and
   CODING->errors are reset on entry.  The return value is
   CODING->result.  */

int
decode_coding (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  /* 1 if a CR carried over from the previous call was put at the
     head of DESTINATION (CCL decoding only), else 0.  */
  int extra = 0;

  if (coding->type == coding_type_undecided)
    detect_coding (coding, source, src_bytes);

  /* For a CCL based coding system, EOL detection is done after the
     CCL program has run (see the coding_type_ccl case below).  */
  if (coding->eol_type == CODING_EOL_UNDECIDED
      && coding->type != coding_type_ccl)
    {
      detect_eol (coding, source, src_bytes);
      /* We had better recover the original eol format if we
         encounter an inconsistent eol format while decoding.  */
      coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
    }

  coding->produced = coding->produced_char = 0;
  coding->consumed = coding->consumed_char = 0;
  coding->errors = 0;
  coding->result = CODING_FINISH_NORMAL;

  switch (coding->type)
    {
    case coding_type_sjis:
      decode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 1);
      break;

    case coding_type_iso2022:
      decode_coding_iso2022 (coding, source, destination,
                             src_bytes, dst_bytes);
      break;

    case coding_type_big5:
      decode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 0);
      break;

    case coding_type_emacs_mule:
      decode_coding_emacs_mule (coding, source, destination,
                                src_bytes, dst_bytes);
      break;

    case coding_type_ccl:
      if (coding->spec.ccl.cr_carryover)
        {
          /* Put the CR which was not processed by the previous call
             of decode_eol_post_ccl in DESTINATION.  It will be
             decoded together with the following LF by the call to
             decode_eol_post_ccl below.  */
          *destination = '\r';
          coding->produced++;
          coding->produced_char++;
          dst_bytes--;
          extra = coding->spec.ccl.cr_carryover;
        }
      ccl_coding_driver (coding, source, destination + extra,
                         src_bytes, dst_bytes, 0);
      if (coding->eol_type != CODING_EOL_LF)
        {
          /* ccl_coding_driver stored fresh values in the produced
             counters; add the carried-over CR back before EOL
             decoding.  */
          coding->produced += extra;
          coding->produced_char += extra;
          decode_eol_post_ccl (coding, destination, coding->produced);
        }
      break;

    default:
      /* Any other type: only EOL decoding is done.  */
      decode_eol (coding, source, destination, src_bytes, dst_bytes);
    }

  /* In the last block, running out of source exactly at its end is a
     normal finish.  */
  if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
      && coding->mode & CODING_MODE_LAST_BLOCK
      && coding->consumed == src_bytes)
    coding->result = CODING_FINISH_NORMAL;

  if (coding->mode & CODING_MODE_LAST_BLOCK
      && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
    {
      /* This is the last block but some source bytes are left
         unconsumed.  Count that as an error and store each remaining
         byte in DESTINATION as a character by CHAR_STRING.  */
      const unsigned char *src = source + coding->consumed;
      unsigned char *dst = destination + coding->produced;

      src_bytes -= coding->consumed;
      coding->errors++;
      /* NOTE(review): presumably this closes a composition still in
         progress; see the definition of DECODE_COMPOSITION_END.  */
      if (COMPOSING_P (coding))
        DECODE_COMPOSITION_END ('1');
      while (src_bytes--)
        {
          int c = *src++;
          dst += CHAR_STRING (c, dst);
          coding->produced_char++;
        }
      coding->consumed = coding->consumed_char = src - source;
      coding->produced = dst - destination;
      coding->result = CODING_FINISH_NORMAL;
    }

  if (!coding->dst_multibyte)
    {
      /* The caller wants a unibyte result; convert in place.  */
      coding->produced = str_as_unibyte (destination, coding->produced);
      coding->produced_char = coding->produced;
    }

  return coding->result;
}
4780
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
   multibyteness of the source is CODING->src_multibyte, the
   multibyteness of the result is always unibyte.

   Encode SRC_BYTES bytes at SOURCE into DESTINATION by the encoder
   selected by CODING->type.  The counters CODING->produced,
   CODING->produced_char, CODING->consumed, CODING->consumed_char and
   CODING->errors are reset on entry.  The return value is
   CODING->result.  */

int
encode_coding (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  coding->produced = coding->produced_char = 0;
  coding->consumed = coding->consumed_char = 0;
  coding->errors = 0;
  coding->result = CODING_FINISH_NORMAL;

  switch (coding->type)
    {
    case coding_type_sjis:
      encode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 1);
      break;

    case coding_type_iso2022:
      encode_coding_iso2022 (coding, source, destination,
                             src_bytes, dst_bytes);
      break;

    case coding_type_big5:
      encode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 0);
      break;

    case coding_type_emacs_mule:
      encode_coding_emacs_mule (coding, source, destination,
                                src_bytes, dst_bytes);
      break;

    case coding_type_ccl:
      ccl_coding_driver (coding, source, destination,
                         src_bytes, dst_bytes, 1);
      break;

    default:
      /* Any other type: only EOL encoding is done.  */
      encode_eol (coding, source, destination, src_bytes, dst_bytes);
    }

  if (coding->mode & CODING_MODE_LAST_BLOCK
      && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
    {
      /* This is the last block but the encoder stopped for lack of
         source.  Finish the output here and copy any unconsumed
         source bytes to DESTINATION.  */
      const unsigned char *src = source + coding->consumed;
      unsigned char *dst = destination + coding->produced;

      /* NOTE(review): presumably this emits the sequences that bring
         the ISO-2022 state back to the initial one; see the
         definition of ENCODE_RESET_PLANE_AND_REGISTER.  */
      if (coding->type == coding_type_iso2022)
        ENCODE_RESET_PLANE_AND_REGISTER;
      /* Emit ESC '1' if a composition is still in progress.  */
      if (COMPOSING_P (coding))
        *dst++ = ISO_CODE_ESC, *dst++ = '1';
      if (coding->consumed < src_bytes)
        {
          int len = src_bytes - coding->consumed;

          BCOPY_SHORT (src, dst, len);
          if (coding->src_multibyte)
            len = str_as_unibyte (dst, len);
          dst += len;
          coding->consumed = src_bytes;
        }
      coding->produced = coding->produced_char = dst - destination;
      coding->result = CODING_FINISH_NORMAL;
    }

  /* Running out of source exactly at its end is a normal finish.  */
  if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
      && coding->consumed == src_bytes)
    coding->result = CODING_FINISH_NORMAL;

  return coding->result;
}
4858
4859 /* Scan text in the region between *BEG and *END (byte positions),
4860    skip characters which we don't have to decode by coding system
4861    CODING at the head and tail, then set *BEG and *END to the region
4862    of the text we actually have to convert.  The caller should move
4863    the gap out of the region in advance if the region is from a
4864    buffer.
4865
4866    If STR is not NULL, *BEG and *END are indices into STR.  */
4867
4868 static void
4869 shrink_decoding_region (beg, end, coding, str)
4870      int *beg, *end;
4871      struct coding_system *coding;
4872      unsigned char *str;
4873 {
4874   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4875   int eol_conversion;
4876   Lisp_Object translation_table;
4877
4878   if (coding->type == coding_type_ccl
4879       || coding->type == coding_type_undecided
4880       || coding->eol_type != CODING_EOL_LF
4881       || !NILP (coding->post_read_conversion)
4882       || coding->composing != COMPOSITION_DISABLED)
4883     {
4884       /* We can't skip any data.  */
4885       return;
4886     }
4887   if (coding->type == coding_type_no_conversion
4888       || coding->type == coding_type_raw_text
4889       || coding->type == coding_type_emacs_mule)
4890     {
4891       /* We need no conversion, but don't have to skip any data here.
4892          Decoding routine handles them effectively anyway.  */
4893       return;
4894     }
4895
4896   translation_table = coding->translation_table_for_decode;
4897   if (NILP (translation_table) && !NILP (Venable_character_translation))
4898     translation_table = Vstandard_translation_table_for_decode;
4899   if (CHAR_TABLE_P (translation_table))
4900     {
4901       int i;
4902       for (i = 0; i < 128; i++)
4903         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4904           break;
4905       if (i < 128)
4906         /* Some ASCII character should be translated.  We give up
4907            shrinking.  */
4908         return;
4909     }
4910
4911   if (coding->heading_ascii >= 0)
4912     /* Detection routine has already found how much we can skip at the
4913        head.  */
4914     *beg += coding->heading_ascii;
4915
4916   if (str)
4917     {
4918       begp_orig = begp = str + *beg;
4919       endp_orig = endp = str + *end;
4920     }
4921   else
4922     {
4923       begp_orig = begp = BYTE_POS_ADDR (*beg);
4924       endp_orig = endp = begp + *end - *beg;
4925     }
4926
4927   eol_conversion = (coding->eol_type == CODING_EOL_CR
4928                     || coding->eol_type == CODING_EOL_CRLF);
4929
4930   switch (coding->type)
4931     {
4932     case coding_type_sjis:
4933     case coding_type_big5:
4934       /* We can skip all ASCII characters at the head.  */
4935       if (coding->heading_ascii < 0)
4936         {
4937           if (eol_conversion)
4938             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4939           else
4940             while (begp < endp && *begp < 0x80) begp++;
4941         }
4942       /* We can skip all ASCII characters at the tail except for the
4943          second byte of SJIS or BIG5 code.  */
4944       if (eol_conversion)
4945         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4946       else
4947         while (begp < endp && endp[-1] < 0x80) endp--;
4948       /* Do not consider LF as ascii if preceded by CR, since that
4949          confuses eol decoding. */
4950       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4951         endp++;
4952       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4953         endp++;
4954       break;
4955
4956     case coding_type_iso2022:
4957       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4958         /* We can't skip any data.  */
4959         break;
4960       if (coding->heading_ascii < 0)
4961         {
4962           /* We can skip all ASCII characters at the head except for a
4963              few control codes.  */
4964           while (begp < endp && (c = *begp) < 0x80
4965                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4966                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4967                  && (!eol_conversion || c != ISO_CODE_LF))
4968             begp++;
4969         }
4970       switch (coding->category_idx)
4971         {
4972         case CODING_CATEGORY_IDX_ISO_8_1:
4973         case CODING_CATEGORY_IDX_ISO_8_2:
4974           /* We can skip all ASCII characters at the tail.  */
4975           if (eol_conversion)
4976             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4977           else
4978             while (begp < endp && endp[-1] < 0x80) endp--;
4979           /* Do not consider LF as ascii if preceded by CR, since that
4980              confuses eol decoding. */
4981           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4982             endp++;
4983           break;
4984
4985         case CODING_CATEGORY_IDX_ISO_7:
4986         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4987           {
4988             /* We can skip all characters at the tail except for 8-bit
4989                codes and ESC and the following 2-byte at the tail.  */
4990             unsigned char *eight_bit = NULL;
4991
4992             if (eol_conversion)
4993               while (begp < endp
4994                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4995                 {
4996                   if (!eight_bit && c & 0x80) eight_bit = endp;
4997                   endp--;
4998                 }
4999             else
5000               while (begp < endp
5001                      && (c = endp[-1]) != ISO_CODE_ESC)
5002                 {
5003                   if (!eight_bit && c & 0x80) eight_bit = endp;
5004                   endp--;
5005                 }
5006             /* Do not consider LF as ascii if preceded by CR, since that
5007                confuses eol decoding. */
5008             if (begp < endp && endp < endp_orig
5009                 && endp[-1] == '\r' && endp[0] == '\n')
5010               endp++;
5011             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5012               {
5013                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5014                   /* This is an ASCII designation sequence.  We can
5015                      surely skip the tail.  But, if we have
5016                      encountered an 8-bit code, skip only the codes
5017                      after that.  */
5018                   endp = eight_bit ? eight_bit : endp + 2;
5019                 else
5020                   /* Hmmm, we can't skip the tail.  */
5021                   endp = endp_orig;
5022               }
5023             else if (eight_bit)
5024               endp = eight_bit;
5025           }
5026         }
5027       break;
5028
5029     default:
5030       abort ();
5031     }
5032   *beg += begp - begp_orig;
5033   *end += endp - endp_orig;
5034   return;
5035 }
5036
/* Like shrink_decoding_region but for encoding.

   Skip characters at the head and tail of the region between *BEG
   and *END (byte positions, or indices into STR if STR is not NULL)
   which we don't have to encode by coding system CODING, then update
   *BEG and *END to the region we actually have to convert.  They are
   left untouched when nothing can be skipped safely.  */

static void
shrink_encoding_region (beg, end, coding, str)
     int *beg, *end;
     struct coding_system *coding;
     unsigned char *str;
{
  unsigned char *begp_orig, *begp, *endp_orig, *endp;
  int eol_conversion;
  Lisp_Object translation_table;

  if (coding->type == coding_type_ccl
      || coding->eol_type == CODING_EOL_CRLF
      || coding->eol_type == CODING_EOL_CR
      || (coding->cmp_data && coding->cmp_data->used > 0))
    {
      /* We can't skip any data.  */
      return;
    }
  if (coding->type == coding_type_no_conversion
      || coding->type == coding_type_raw_text
      || coding->type == coding_type_emacs_mule
      || coding->type == coding_type_undecided)
    {
      /* We need no conversion, but don't have to skip any data here.
         Encoding routine handles them effectively anyway.  */
      return;
    }

  translation_table = coding->translation_table_for_encode;
  if (NILP (translation_table) && !NILP (Venable_character_translation))
    translation_table = Vstandard_translation_table_for_encode;
  if (CHAR_TABLE_P (translation_table))
    {
      int i;
      for (i = 0; i < 128; i++)
        if (!NILP (CHAR_TABLE_REF (translation_table, i)))
          break;
      if (i < 128)
        /* Some ASCII character should be translated.  We give up
           shrinking.  */
        return;
    }

  if (str)
    {
      begp_orig = begp = str + *beg;
      endp_orig = endp = str + *end;
    }
  else
    {
      begp_orig = begp = BYTE_POS_ADDR (*beg);
      endp_orig = endp = begp + *end - *beg;
    }

  /* Because of the early return above for CODING_EOL_CRLF and
     CODING_EOL_CR, this is currently always zero.  */
  eol_conversion = (coding->eol_type == CODING_EOL_CR
                    || coding->eol_type == CODING_EOL_CRLF);

  /* Here, we don't have to check coding->pre_write_conversion because
     the caller is expected to have handled it already.  */
  switch (coding->type)
    {
    case coding_type_iso2022:
      if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
        /* We can't skip any data.  */
        break;
      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
        {
          /* Skip leading ASCII characters, but only up to the
             beginning of the last line reached, so that the region
             to encode starts at a beginning of line.  */
          unsigned char *bol = begp;
          while (begp < endp && *begp < 0x80)
            {
              begp++;
              if (begp[-1] == '\n')
                bol = begp;
            }
          begp = bol;
          goto label_skip_tail;
        }
      /* fall down ... */

    case coding_type_sjis:
    case coding_type_big5:
      /* We can skip all ASCII characters at the head and tail.  */
      if (eol_conversion)
        while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
      else
        while (begp < endp && *begp < 0x80) begp++;
    label_skip_tail:
      /* Also entered by the goto above, with the head already
         decided.  */
      if (eol_conversion)
        while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
      else
        while (begp < endp && *(endp - 1) < 0x80) endp--;
      break;

    default:
      abort ();
    }

  /* Reflect the amount skipped at the head and tail in the
     positions.  */
  *beg += begp - begp_orig;
  *end += endp - endp_orig;
  return;
}
5140
/* As shrinking the conversion region has some overhead, we don't try
   it unless the length of the conversion region (in bytes) is greater
   than this value.  */
static int shrink_conversion_region_threshhold = 1024;
5145
/* Shrink the region between *BEG and *END for conversion by CODING
   if the region is longer than shrink_conversion_region_threshhold.
   ENCODEP nonzero means the region is to be encoded, zero means it is
   to be decoded.  STR is passed on to the shrinking function.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (!(encodep))                                                 \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5154
/* Reset inhibit_pre_post_conversion to zero and return Qnil.  DUMMY
   is ignored.  NOTE(review): judging from the name and signature,
   this is meant to be registered as an unwind handler; confirm at the
   call sites.  */
static Lisp_Object
code_convert_region_unwind (dummy)
     Lisp_Object dummy;
{
  inhibit_pre_post_conversion = 0;
  return Qnil;
}
5162
5163 /* Store information about all compositions in the range FROM and TO
5164    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5165    buffer or a string, defaults to the current buffer.  */
5166
5167 void
5168 coding_save_composition (coding, from, to, obj)
5169      struct coding_system *coding;
5170      int from, to;
5171      Lisp_Object obj;
5172 {
5173   Lisp_Object prop;
5174   int start, end;
5175
5176   if (coding->composing == COMPOSITION_DISABLED)
5177     return;
5178   if (!coding->cmp_data)
5179     coding_allocate_composition_data (coding, from);
5180   if (!find_composition (from, to, &start, &end, &prop, obj)
5181       || end > to)
5182     return;
5183   if (start < from
5184       && (!find_composition (end, to, &start, &end, &prop, obj)
5185           || end > to))
5186     return;
5187   coding->composing = COMPOSITION_NO;
5188   do
5189     {
5190       if (COMPOSITION_VALID_P (start, end, prop))
5191         {
5192           enum composition_method method = COMPOSITION_METHOD (prop);
5193           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5194               >= COMPOSITION_DATA_SIZE)
5195             coding_allocate_composition_data (coding, from);
5196           /* For relative composition, we remember start and end
5197              positions, for the other compositions, we also remember
5198              components.  */
5199           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5200           if (method != COMPOSITION_RELATIVE)
5201             {
5202               /* We must store a*/
5203               Lisp_Object val, ch;
5204
5205               val = COMPOSITION_COMPONENTS (prop);
5206               if (CONSP (val))
5207                 while (CONSP (val))
5208                   {
5209                     ch = XCAR (val), val = XCDR (val);
5210                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5211                   }
5212               else if (VECTORP (val) || STRINGP (val))
5213                 {
5214                   int len = (VECTORP (val)
5215                              ? XVECTOR (val)->size : SCHARS (val));
5216                   int i;
5217                   for (i = 0; i < len; i++)
5218                     {
5219                       ch = (STRINGP (val)
5220                             ? Faref (val, make_number (i))
5221                             : XVECTOR (val)->contents[i]);
5222                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5223                     }
5224                 }
5225               else              /* INTEGERP (val) */
5226                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5227             }
5228           CODING_ADD_COMPOSITION_END (coding, end - from);
5229         }
5230       start = end;
5231     }
5232   while (start < to
5233          && find_composition (start, to, &start, &end, &prop, obj)
5234          && end <= to);
5235
5236   /* Make coding->cmp_data point to the first memory block.  */
5237   while (coding->cmp_data->prev)
5238     coding->cmp_data = coding->cmp_data->prev;
5239   coding->cmp_data_start = 0;
5240 }
5241
5242 /* Reflect the saved information about compositions to OBJ.
5243    CODING->cmp_data points to a memory block for the information.  OBJ
5244    is a buffer or a string, defaults to the current buffer.  */
5245
5246 void
5247 coding_restore_composition (coding, obj)
5248      struct coding_system *coding;
5249      Lisp_Object obj;
5250 {
5251   struct composition_data *cmp_data = coding->cmp_data;
5252
5253   if (!cmp_data)
5254     return;
5255
5256   while (cmp_data->prev)
5257     cmp_data = cmp_data->prev;
5258
5259   while (cmp_data)
5260     {
5261       int i;
5262
5263       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5264            i += cmp_data->data[i])
5265         {
5266           int *data = cmp_data->data + i;
5267           enum composition_method method = (enum composition_method) data[3];
5268           Lisp_Object components;
5269
5270           if (method == COMPOSITION_RELATIVE)
5271             components = Qnil;
5272           else
5273             {
5274               int len = data[0] - 4, j;
5275               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5276
5277               for (j = 0; j < len; j++)
5278                 args[j] = make_number (data[4 + j]);
5279               components = (method == COMPOSITION_WITH_ALTCHARS
5280                             ? Fstring (len, args) : Fvector (len, args));
5281             }
5282           compose_text (data[1], data[2], components, Qnil, obj);
5283         }
5284       cmp_data = cmp_data->next;
5285     }
5286 }
5287
5288 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5289    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5290    coding system CODING, and return the status code of code conversion
5291    (currently, this value has no meaning).
5292
5293    How many characters (and bytes) are converted to how many
5294    characters (and bytes) are recorded in members of the structure
5295    CODING.
5296
5297    If REPLACE is nonzero, we do various things as if the original text
5298    is deleted and a new text is inserted.  See the comments in
5299    replace_range (insdel.c) to know what we are doing.
5300
5301    If REPLACE is zero, it is assumed that the source text is unibyte.
5302    Otherwise, it is assumed that the source text is multibyte.  */
5303
5304 int
5305 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5306      int from, from_byte, to, to_byte, encodep, replace;
5307      struct coding_system *coding;
5308 {
5309   int len = to - from, len_byte = to_byte - from_byte;
5310   int nchars_del = 0, nbytes_del = 0;
5311   int require, inserted, inserted_byte;
5312   int head_skip, tail_skip, total_skip = 0;
5313   Lisp_Object saved_coding_symbol;
5314   int first = 1;
5315   unsigned char *src, *dst;
5316   Lisp_Object deletion;
5317   int orig_point = PT, orig_len = len;
5318   int prev_Z;
5319   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5320
5321   deletion = Qnil;
5322   saved_coding_symbol = coding->symbol;
5323
5324   if (from < PT && PT < to)
5325     {
5326       TEMP_SET_PT_BOTH (from, from_byte);
5327       orig_point = from;
5328     }
5329
5330   if (replace)
5331     {
5332       int saved_from = from;
5333       int saved_inhibit_modification_hooks;
5334
5335       prepare_to_modify_buffer (from, to, &from);
5336       if (saved_from != from)
5337         {
5338           to = from + len;
5339           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5340           len_byte = to_byte - from_byte;
5341         }
5342
5343       /* The code conversion routine can not preserve text properties
5344          for now.  So, we must remove all text properties in the
5345          region.  Here, we must suppress all modification hooks.  */
5346       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5347       inhibit_modification_hooks = 1;
5348       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5349       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5350     }
5351
5352   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5353     {
5354       /* We must detect encoding of text and eol format.  */
5355
5356       if (from < GPT && to > GPT)
5357         move_gap_both (from, from_byte);
5358       if (coding->type == coding_type_undecided)
5359         {
5360           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5361           if (coding->type == coding_type_undecided)
5362             {
5363               /* It seems that the text contains only ASCII, but we
5364                  should not leave it undecided because the deeper
5365                  decoding routine (decode_coding) tries to detect the
5366                  encodings again in vain.  */
5367               coding->type = coding_type_emacs_mule;
5368               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5369               /* As emacs-mule decoder will handle composition, we
5370                  need this setting to allocate coding->cmp_data
5371                  later.  */
5372               coding->composing = COMPOSITION_NO;
5373             }
5374         }
5375       if (coding->eol_type == CODING_EOL_UNDECIDED
5376           && coding->type != coding_type_ccl)
5377         {
5378           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5379           if (coding->eol_type == CODING_EOL_UNDECIDED)
5380             coding->eol_type = CODING_EOL_LF;
5381           /* We had better recover the original eol format if we
5382              encounter an inconsistent eol format while decoding.  */
5383           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5384         }
5385     }
5386
5387   /* Now we convert the text.  */
5388
5389   /* For encoding, we must process pre-write-conversion in advance.  */
5390   if (! inhibit_pre_post_conversion
5391       && encodep
5392       && SYMBOLP (coding->pre_write_conversion)
5393       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5394     {
5395       /* The function in pre-write-conversion may put a new text in a
5396          new buffer.  */
5397       struct buffer *prev = current_buffer;
5398       Lisp_Object new;
5399
5400       record_unwind_protect (code_convert_region_unwind, Qnil);
5401       /* We should not call any more pre-write/post-read-conversion
5402          functions while this pre-write-conversion is running.  */
5403       inhibit_pre_post_conversion = 1;
5404       call2 (coding->pre_write_conversion,
5405              make_number (from), make_number (to));
5406       inhibit_pre_post_conversion = 0;
5407       /* Discard the unwind protect.  */
5408       specpdl_ptr--;
5409
5410       if (current_buffer != prev)
5411         {
5412           len = ZV - BEGV;
5413           new = Fcurrent_buffer ();
5414           set_buffer_internal_1 (prev);
5415           del_range_2 (from, from_byte, to, to_byte, 0);
5416           TEMP_SET_PT_BOTH (from, from_byte);
5417           insert_from_buffer (XBUFFER (new), 1, len, 0);
5418           Fkill_buffer (new);
5419           if (orig_point >= to)
5420             orig_point += len - orig_len;
5421           else if (orig_point > from)
5422             orig_point = from;
5423           orig_len = len;
5424           to = from + len;
5425           from_byte = CHAR_TO_BYTE (from);
5426           to_byte = CHAR_TO_BYTE (to);
5427           len_byte = to_byte - from_byte;
5428           TEMP_SET_PT_BOTH (from, from_byte);
5429         }
5430     }
5431
5432   if (replace)
5433     {
5434       if (! EQ (current_buffer->undo_list, Qt))
5435         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5436       else
5437         {
5438           nchars_del = to - from;
5439           nbytes_del = to_byte - from_byte;
5440         }
5441     }
5442
5443   if (coding->composing != COMPOSITION_DISABLED)
5444     {
5445       if (encodep)
5446         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5447       else
5448         coding_allocate_composition_data (coding, from);
5449     }
5450
5451   /* Try to skip the heading and tailing ASCIIs.  */
5452   if (coding->type != coding_type_ccl)
5453     {
5454       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5455
5456       if (from < GPT && GPT < to)
5457         move_gap_both (from, from_byte);
5458       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5459       if (from_byte == to_byte
5460           && (encodep || NILP (coding->post_read_conversion))
5461           && ! CODING_REQUIRE_FLUSHING (coding))
5462         {
5463           coding->produced = len_byte;
5464           coding->produced_char = len;
5465           if (!replace)
5466             /* We must record and adjust for this new text now.  */
5467             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5468           return 0;
5469         }
5470
5471       head_skip = from_byte - from_byte_orig;
5472       tail_skip = to_byte_orig - to_byte;
5473       total_skip = head_skip + tail_skip;
5474       from += head_skip;
5475       to -= tail_skip;
5476       len -= total_skip; len_byte -= total_skip;
5477     }
5478
5479   /* For conversion, we must put the gap before the text in addition to
5480      making the gap larger for efficient decoding.  The required gap
5481      size starts from 2000 which is the magic number used in make_gap.
5482      But, after one batch of conversion, it will be incremented if we
5483      find that it is not enough .  */
5484   require = 2000;
5485
5486   if (GAP_SIZE  < require)
5487     make_gap (require - GAP_SIZE);
5488   move_gap_both (from, from_byte);
5489
5490   inserted = inserted_byte = 0;
5491
5492   GAP_SIZE += len_byte;
5493   ZV -= len;
5494   Z -= len;
5495   ZV_BYTE -= len_byte;
5496   Z_BYTE -= len_byte;
5497
5498   if (GPT - BEG < BEG_UNCHANGED)
5499     BEG_UNCHANGED = GPT - BEG;
5500   if (Z - GPT < END_UNCHANGED)
5501     END_UNCHANGED = Z - GPT;
5502
5503   if (!encodep && coding->src_multibyte)
5504     {
5505       /* Decoding routines expects that the source text is unibyte.
5506          We must convert 8-bit characters of multibyte form to
5507          unibyte.  */
5508       int len_byte_orig = len_byte;
5509       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5510       if (len_byte < len_byte_orig)
5511         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5512                     len_byte);
5513       coding->src_multibyte = 0;
5514     }
5515
5516   for (;;)
5517     {
5518       int result;
5519
5520       /* The buffer memory is now:
5521          +--------+converted-text+---------+-------original-text-------+---+
5522          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5523                   |<---------------------- GAP ----------------------->|  */
5524       src = GAP_END_ADDR - len_byte;
5525       dst = GPT_ADDR + inserted_byte;
5526
5527       if (encodep)
5528         result = encode_coding (coding, src, dst, len_byte, 0);
5529       else
5530         {
5531           if (coding->composing != COMPOSITION_DISABLED)
5532             coding->cmp_data->char_offset = from + inserted;
5533           result = decode_coding (coding, src, dst, len_byte, 0);
5534         }
5535
5536       /* The buffer memory is now:
5537          +--------+-------converted-text----+--+------original-text----+---+
5538          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5539                   |<---------------------- GAP ----------------------->|  */
5540
5541       inserted += coding->produced_char;
5542       inserted_byte += coding->produced;
5543       len_byte -= coding->consumed;
5544
5545       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5546         {
5547           coding_allocate_composition_data (coding, from + inserted);
5548           continue;
5549         }
5550
5551       src += coding->consumed;
5552       dst += coding->produced;
5553
5554       if (result == CODING_FINISH_NORMAL)
5555         {
5556           src += len_byte;
5557           break;
5558         }
5559       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5560         {
5561           unsigned char *pend = dst, *p = pend - inserted_byte;
5562           Lisp_Object eol_type;
5563
5564           /* Encode LFs back to the original eol format (CR or CRLF).  */
5565           if (coding->eol_type == CODING_EOL_CR)
5566             {
5567               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5568             }
5569           else
5570             {
5571               int count = 0;
5572
5573               while (p < pend) if (*p++ == '\n') count++;
5574               if (src - dst < count)
5575                 {
5576                   /* We don't have sufficient room for encoding LFs
5577                      back to CRLF.  We must record converted and
5578                      not-yet-converted text back to the buffer
5579                      content, enlarge the gap, then record them out of
5580                      the buffer contents again.  */
5581                   int add = len_byte + inserted_byte;
5582
5583                   GAP_SIZE -= add;
5584                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5585                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5586                   make_gap (count - GAP_SIZE);
5587                   GAP_SIZE += add;
5588                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5589                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5590                   /* Don't forget to update SRC, DST, and PEND.  */
5591                   src = GAP_END_ADDR - len_byte;
5592                   dst = GPT_ADDR + inserted_byte;
5593                   pend = dst;
5594                 }
5595               inserted += count;
5596               inserted_byte += count;
5597               coding->produced += count;
5598               p = dst = pend + count;
5599               while (count)
5600                 {
5601                   *--p = *--pend;
5602                   if (*p == '\n') count--, *--p = '\r';
5603                 }
5604             }
5605
5606           /* Suppress eol-format conversion in the further conversion.  */
5607           coding->eol_type = CODING_EOL_LF;
5608
5609           /* Set the coding system symbol to that for Unix-like EOL.  */
5610           eol_type = Fget (saved_coding_symbol, Qeol_type);
5611           if (VECTORP (eol_type)
5612               && XVECTOR (eol_type)->size == 3
5613               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5614             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5615           else
5616             coding->symbol = saved_coding_symbol;
5617
5618           continue;
5619         }
5620       if (len_byte <= 0)
5621         {
5622           if (coding->type != coding_type_ccl
5623               || coding->mode & CODING_MODE_LAST_BLOCK)
5624             break;
5625           coding->mode |= CODING_MODE_LAST_BLOCK;
5626           continue;
5627         }
5628       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5629         {
5630           /* The source text ends in invalid codes.  Let's just
5631              make them valid buffer contents, and finish conversion.  */
5632           if (multibyte_p)
5633             {
5634               unsigned char *start = dst;
5635
5636               inserted += len_byte;
5637               while (len_byte--)
5638                 {
5639                   int c = *src++;
5640                   dst += CHAR_STRING (c, dst);
5641                 }
5642
5643               inserted_byte += dst - start;
5644             }
5645           else
5646             {
5647               inserted += len_byte;
5648               inserted_byte += len_byte;
5649               while (len_byte--)
5650                 *dst++ = *src++;
5651             }
5652           break;
5653         }
5654       if (result == CODING_FINISH_INTERRUPT)
5655         {
5656           /* The conversion procedure was interrupted by a user.  */
5657           break;
5658         }
5659       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5660       if (coding->consumed < 1)
5661         {
5662           /* It's quite strange to require more memory without
5663              consuming any bytes.  Perhaps CCL program bug.  */
5664           break;
5665         }
5666       if (first)
5667         {
5668           /* We have just done the first batch of conversion which was
5669              stopped because of insufficient gap.  Let's reconsider the
5670              required gap size (i.e. SRT - DST) now.
5671
5672              We have converted ORIG bytes (== coding->consumed) into
5673              NEW bytes (coding->produced).  To convert the remaining
5674              LEN bytes, we may need REQUIRE bytes of gap, where:
5675                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5676                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5677              Here, we are sure that NEW >= ORIG.  */
5678           float ratio;
5679
5680           if (coding->produced <= coding->consumed)
5681             {
5682               /* This happens because of CCL-based coding system with
5683                  eol-type CRLF.  */
5684               require = 0;
5685             }
5686           else
5687             {
5688               ratio = (coding->produced - coding->consumed) / coding->consumed;
5689               require = len_byte * ratio;
5690             }
5691           first = 0;
5692         }
5693       if ((src - dst) < (require + 2000))
5694         {
5695           /* See the comment above the previous call of make_gap.  */
5696           int add = len_byte + inserted_byte;
5697
5698           GAP_SIZE -= add;
5699           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5700           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5701           make_gap (require + 2000);
5702           GAP_SIZE += add;
5703           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5704           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5705         }
5706     }
5707   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5708
5709   if (encodep && coding->dst_multibyte)
5710     {
5711       /* The output is unibyte.  We must convert 8-bit characters to
5712          multibyte form.  */
5713       if (inserted_byte * 2 > GAP_SIZE)
5714         {
5715           GAP_SIZE -= inserted_byte;
5716           ZV += inserted_byte; Z += inserted_byte;
5717           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5718           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5719           make_gap (inserted_byte - GAP_SIZE);
5720           GAP_SIZE += inserted_byte;
5721           ZV -= inserted_byte; Z -= inserted_byte;
5722           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5723           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5724         }
5725       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5726     }
5727
5728   /* If we shrank the conversion area, adjust it now.  */
5729   if (total_skip > 0)
5730     {
5731       if (tail_skip > 0)
5732         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5733       inserted += total_skip; inserted_byte += total_skip;
5734       GAP_SIZE += total_skip;
5735       GPT -= head_skip; GPT_BYTE -= head_skip;
5736       ZV -= total_skip; ZV_BYTE -= total_skip;
5737       Z -= total_skip; Z_BYTE -= total_skip;
5738       from -= head_skip; from_byte -= head_skip;
5739       to += tail_skip; to_byte += tail_skip;
5740     }
5741
5742   prev_Z = Z;
5743   if (! EQ (current_buffer->undo_list, Qt))
5744     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5745   else
5746     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5747                                  inserted, inserted_byte);
5748   inserted = Z - prev_Z;
5749
5750   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5751     coding_restore_composition (coding, Fcurrent_buffer ());
5752   coding_free_composition_data (coding);
5753
5754   if (! inhibit_pre_post_conversion
5755       && ! encodep && ! NILP (coding->post_read_conversion))
5756     {
5757       Lisp_Object val;
5758
5759       if (from != PT)
5760         TEMP_SET_PT_BOTH (from, from_byte);
5761       prev_Z = Z;
5762       record_unwind_protect (code_convert_region_unwind, Qnil);
5763       /* We should not call any more pre-write/post-read-conversion
5764          functions while this post-read-conversion is running.  */
5765       inhibit_pre_post_conversion = 1;
5766       val = call1 (coding->post_read_conversion, make_number (inserted));
5767       inhibit_pre_post_conversion = 0;
5768       /* Discard the unwind protect.  */
5769       specpdl_ptr--;
5770       CHECK_NUMBER (val);
5771       inserted += Z - prev_Z;
5772     }
5773
5774   if (orig_point >= from)
5775     {
5776       if (orig_point >= from + orig_len)
5777         orig_point += inserted - orig_len;
5778       else
5779         orig_point = from;
5780       TEMP_SET_PT (orig_point);
5781     }
5782
5783   if (replace)
5784     {
5785       signal_after_change (from, to - from, inserted);
5786       update_compositions (from, from + inserted, CHECK_BORDER);
5787     }
5788
5789   {
5790     coding->consumed = to_byte - from_byte;
5791     coding->consumed_char = to - from;
5792     coding->produced = inserted_byte;
5793     coding->produced_char = inserted;
5794   }
5795
5796   return 0;
5797 }
5798
5799 Lisp_Object
5800 run_pre_post_conversion_on_str (str, coding, encodep)
5801      Lisp_Object str;
5802      struct coding_system *coding;
5803      int encodep;
5804 {
5805   int count = SPECPDL_INDEX ();
5806   struct gcpro gcpro1, gcpro2;
5807   int multibyte = STRING_MULTIBYTE (str);
5808   Lisp_Object buffer;
5809   struct buffer *buf;
5810   Lisp_Object old_deactivate_mark;
5811
5812   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5813   record_unwind_protect (code_convert_region_unwind, Qnil);
5814   /* It is not crucial to specbind this.  */
5815   old_deactivate_mark = Vdeactivate_mark;
5816   GCPRO2 (str, old_deactivate_mark);
5817
5818   buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5819   buf = XBUFFER (buffer);
5820
5821   buf->directory = current_buffer->directory;
5822   buf->read_only = Qnil;
5823   buf->filename = Qnil;
5824   buf->undo_list = Qt;
5825   buf->overlays_before = Qnil;
5826   buf->overlays_after = Qnil;
5827
5828   set_buffer_internal (buf);
5829   /* We must insert the contents of STR as is without
5830      unibyte<->multibyte conversion.  For that, we adjust the
5831      multibyteness of the working buffer to that of STR.  */
5832   Ferase_buffer ();
5833   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
5834
5835   insert_from_string (str, 0, 0,
5836                       SCHARS (str), SBYTES (str), 0);
5837   UNGCPRO;
5838   inhibit_pre_post_conversion = 1;
5839   if (encodep)
5840     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5841   else
5842     {
5843       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5844       call1 (coding->post_read_conversion, make_number (Z - BEG));
5845     }
5846   inhibit_pre_post_conversion = 0;
5847   Vdeactivate_mark = old_deactivate_mark;
5848   str = make_buffer_string (BEG, Z, 1);
5849   return unbind_to (count, str);
5850 }
5851
5852 Lisp_Object
5853 decode_coding_string (str, coding, nocopy)
5854      Lisp_Object str;
5855      struct coding_system *coding;
5856      int nocopy;
5857 {
5858   int len;
5859   struct conversion_buffer buf;
5860   int from, to_byte;
5861   Lisp_Object saved_coding_symbol;
5862   int result;
5863   int require_decoding;
5864   int shrinked_bytes = 0;
5865   Lisp_Object newstr;
5866   int consumed, consumed_char, produced, produced_char;
5867
5868   from = 0;
5869   to_byte = SBYTES (str);
5870
5871   saved_coding_symbol = coding->symbol;
5872   coding->src_multibyte = STRING_MULTIBYTE (str);
5873   coding->dst_multibyte = 1;
5874   if (CODING_REQUIRE_DETECTION (coding))
5875     {
5876       /* See the comments in code_convert_region.  */
5877       if (coding->type == coding_type_undecided)
5878         {
5879           detect_coding (coding, SDATA (str), to_byte);
5880           if (coding->type == coding_type_undecided)
5881             {
5882               coding->type = coding_type_emacs_mule;
5883               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5884               /* As emacs-mule decoder will handle composition, we
5885                  need this setting to allocate coding->cmp_data
5886                  later.  */
5887               coding->composing = COMPOSITION_NO;
5888             }
5889         }
5890       if (coding->eol_type == CODING_EOL_UNDECIDED
5891           && coding->type != coding_type_ccl)
5892         {
5893           saved_coding_symbol = coding->symbol;
5894           detect_eol (coding, SDATA (str), to_byte);
5895           if (coding->eol_type == CODING_EOL_UNDECIDED)
5896             coding->eol_type = CODING_EOL_LF;
5897           /* We had better recover the original eol format if we
5898              encounter an inconsistent eol format while decoding.  */
5899           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5900         }
5901     }
5902
5903   if (coding->type == coding_type_no_conversion
5904       || coding->type == coding_type_raw_text)
5905     coding->dst_multibyte = 0;
5906
5907   require_decoding = CODING_REQUIRE_DECODING (coding);
5908
5909   if (STRING_MULTIBYTE (str))
5910     {
5911       /* Decoding routines expect the source text to be unibyte.  */
5912       str = Fstring_as_unibyte (str);
5913       to_byte = SBYTES (str);
5914       nocopy = 1;
5915       coding->src_multibyte = 0;
5916     }
5917
5918   /* Try to skip the heading and tailing ASCIIs.  */
5919   if (require_decoding && coding->type != coding_type_ccl)
5920     {
5921       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
5922                                 0);
5923       if (from == to_byte)
5924         require_decoding = 0;
5925       shrinked_bytes = from + (SBYTES (str) - to_byte);
5926     }
5927
5928   if (!require_decoding)
5929     {
5930       coding->consumed = SBYTES (str);
5931       coding->consumed_char = SCHARS (str);
5932       if (coding->dst_multibyte)
5933         {
5934           str = Fstring_as_multibyte (str);
5935           nocopy = 1;
5936         }
5937       coding->produced = SBYTES (str);
5938       coding->produced_char = SCHARS (str);
5939       return (nocopy ? str : Fcopy_sequence (str));
5940     }
5941
5942   if (coding->composing != COMPOSITION_DISABLED)
5943     coding_allocate_composition_data (coding, from);
5944   len = decoding_buffer_size (coding, to_byte - from);
5945   allocate_conversion_buffer (buf, len);
5946
5947   consumed = consumed_char = produced = produced_char = 0;
5948   while (1)
5949     {
5950       result = decode_coding (coding, SDATA (str) + from + consumed,
5951                               buf.data + produced, to_byte - from - consumed,
5952                               buf.size - produced);
5953       consumed += coding->consumed;
5954       consumed_char += coding->consumed_char;
5955       produced += coding->produced;
5956       produced_char += coding->produced_char;
5957       if (result == CODING_FINISH_NORMAL
5958           || (result == CODING_FINISH_INSUFFICIENT_SRC
5959               && coding->consumed == 0))
5960         break;
5961       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5962         coding_allocate_composition_data (coding, from + produced_char);
5963       else if (result == CODING_FINISH_INSUFFICIENT_DST)
5964         extend_conversion_buffer (&buf);
5965       else if (result == CODING_FINISH_INCONSISTENT_EOL)
5966         {
5967           Lisp_Object eol_type;
5968
5969           /* Recover the original EOL format.  */
5970           if (coding->eol_type == CODING_EOL_CR)
5971             {
5972               unsigned char *p;
5973               for (p = buf.data; p < buf.data + produced; p++)
5974                 if (*p == '\n') *p = '\r';
5975             }
5976           else if (coding->eol_type == CODING_EOL_CRLF)
5977             {
5978               int num_eol = 0;
5979               unsigned char *p0, *p1;
5980               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5981                 if (*p0 == '\n') num_eol++;
5982               if (produced + num_eol >= buf.size)
5983                 extend_conversion_buffer (&buf);
5984               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5985                 {
5986                   *--p1 = *--p0;
5987                   if (*p0 == '\n') *--p1 = '\r';
5988                 }
5989               produced += num_eol;
5990               produced_char += num_eol;
5991             }
5992           /* Suppress eol-format conversion in the further conversion.  */
5993           coding->eol_type = CODING_EOL_LF;
5994
5995           /* Set the coding system symbol to that for Unix-like EOL.  */
5996           eol_type = Fget (saved_coding_symbol, Qeol_type);
5997           if (VECTORP (eol_type)
5998               && XVECTOR (eol_type)->size == 3
5999               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6000             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6001           else
6002             coding->symbol = saved_coding_symbol;
6003
6004
6005         }
6006     }
6007
6008   coding->consumed = consumed;
6009   coding->consumed_char = consumed_char;
6010   coding->produced = produced;
6011   coding->produced_char = produced_char;
6012
6013   if (coding->dst_multibyte)
6014     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6015                                            produced + shrinked_bytes);
6016   else
6017     newstr = make_uninit_string (produced + shrinked_bytes);
6018   if (from > 0)
6019     STRING_COPYIN (newstr, 0, SDATA (str), from);
6020   STRING_COPYIN (newstr, from, buf.data, produced);
6021   if (shrinked_bytes > from)
6022     STRING_COPYIN (newstr, from + produced,
6023                    SDATA (str) + to_byte,
6024                    shrinked_bytes - from);
6025   free_conversion_buffer (&buf);
6026
6027   if (coding->cmp_data && coding->cmp_data->used)
6028     coding_restore_composition (coding, newstr);
6029   coding_free_composition_data (coding);
6030
6031   if (SYMBOLP (coding->post_read_conversion)
6032       && !NILP (Ffboundp (coding->post_read_conversion)))
6033     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6034
6035   return newstr;
6036 }
6037
6038 Lisp_Object
6039 encode_coding_string (str, coding, nocopy)
6040      Lisp_Object str;
6041      struct coding_system *coding;
6042      int nocopy;
6043 {
       /* Encode the string STR as directed by CODING and return the
          result.  If NOCOPY is nonzero, STR itself may be returned when
          nothing has to be converted; otherwise a copy is returned in
          that case.  On the full conversion path the accumulated counts
          are stored in CODING->consumed, CODING->consumed_char,
          CODING->produced and CODING->produced_char.  */
6044   int len;
6045   struct conversion_buffer buf;
6046   int from, to, to_byte;
6047   int result;
6048   int shrinked_bytes = 0;
6049   Lisp_Object newstr;
6050   int consumed, consumed_char, produced, produced_char;
6051
       /* Let the coding system's pre-write function, if it names a
          defined function, transform STR first.  */
6052   if (SYMBOLP (coding->pre_write_conversion)
6053       && !NILP (Ffboundp (coding->pre_write_conversion)))
6054     str = run_pre_post_conversion_on_str (str, coding, 1);
6055
6056   from = 0;
6057   to = SCHARS (str);
6058   to_byte = SBYTES (str);
6059
6060   /* Encoding routines determine the multibyteness of the source text
6061      by coding->src_multibyte.  */
6062   coding->src_multibyte = STRING_MULTIBYTE (str);
6063   coding->dst_multibyte = 0;
6064   if (! CODING_REQUIRE_ENCODING (coding))
6065     {
           /* No encoding is required: report the whole string as both
              consumed and produced.  */
6066       coding->consumed = SBYTES (str);
6067       coding->consumed_char = SCHARS (str);
6068       if (STRING_MULTIBYTE (str))
6069         {
               /* STR now refers to the result of Fstring_as_unibyte, so
                  the extra copy below is skipped.  */
6070           str = Fstring_as_unibyte (str);
6071           nocopy = 1;
6072         }
6073       coding->produced = SBYTES (str);
6074       coding->produced_char = SCHARS (str);
6075       return (nocopy ? str : Fcopy_sequence (str));
6076     }
6077
6078   if (coding->composing != COMPOSITION_DISABLED)
6079     coding_save_composition (coding, from, to, str);
6080
6081   /* Try to skip the heading and trailing ASCIIs.  */
6082   if (coding->type != coding_type_ccl)
6083     {
6084       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6085                                 1);
6086       if (from == to_byte)
             {
               /* Nothing is left to encode.  Release the composition
                  data saved above before leaving; the normal exit path
                  at the end of this function does the same.  */
               coding_free_composition_data (coding);
6087           return (nocopy ? str : Fcopy_sequence (str));
             }
           /* Number of bytes skipped at the head plus those skipped at
              the tail.  */
6088       shrinked_bytes = from + (SBYTES (str) - to_byte);
6089     }
6090
6091   len = encoding_buffer_size (coding, to_byte - from);
6092   allocate_conversion_buffer (buf, len);
6093
6094   consumed = consumed_char = produced = produced_char = 0;
6095   while (1)
6096     {
6097       result = encode_coding (coding, SDATA (str) + from + consumed,
6098                               buf.data + produced, to_byte - from - consumed,
6099                               buf.size - produced);
           /* encode_coding reports per-call counts; accumulate them.  */
6100       consumed += coding->consumed;
6101       consumed_char += coding->consumed_char;
6102       produced += coding->produced;
6103       produced_char += coding->produced_char;
           /* Stop when finished, or when the source is reported
              insufficient and this call consumed nothing.  */
6104       if (result == CODING_FINISH_NORMAL
6105           || (result == CODING_FINISH_INSUFFICIENT_SRC
6106               && coding->consumed == 0))
6107         break;
6108       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6109       extend_conversion_buffer (&buf);
6110     }
6111
6112   coding->consumed = consumed;
6113   coding->consumed_char = consumed_char;
6114   coding->produced = produced;
6115   coding->produced_char = produced_char;
6116
       /* Assemble the result: the skipped head of STR, the encoded
          bytes, then the skipped tail of STR.  */
6117   newstr = make_uninit_string (produced + shrinked_bytes);
6118   if (from > 0)
6119     STRING_COPYIN (newstr, 0, SDATA (str), from);
6120   STRING_COPYIN (newstr, from, buf.data, produced);
6121   if (shrinked_bytes > from)
6122     STRING_COPYIN (newstr, from + produced,
6123                    SDATA (str) + to_byte,
6124                    shrinked_bytes - from);
6125
6126   free_conversion_buffer (&buf);
6127   coding_free_composition_data (coding);
6128
6129   return newstr;
6130 }
6131
6132 \f
6133 #ifdef emacs
6134 /*** 8. Emacs Lisp library functions ***/
6135
6136 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6137        doc: /* Return t if OBJECT is nil or a coding-system.
6138 See the documentation of `make-coding-system' for information
6139 about coding-system objects.  */)
6140      (obj)
6141      Lisp_Object obj;
6142 {
       /* nil is accepted as a coding system.  */
6143   if (NILP (obj))
6144     return Qt;
       /* Any other object that is not a symbol is rejected.  */
6145   if (!SYMBOLP (obj))
6146     return Qnil;
6147   /* Get coding-spec vector for OBJ.  */
6148   obj = Fget (obj, Qcoding_system);
       /* OBJ qualifies only if that property is a vector of five
          elements.  */
6149   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6150           ? Qt : Qnil);
6151 }
6152
6153 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6154        Sread_non_nil_coding_system, 1, 1, 0,
6155        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6156      (prompt)
6157      Lisp_Object prompt;
6158 {
6159   Lisp_Object val;
       /* Ask again for as long as the answer is an empty string.  */
6160   do
6161     {
6162       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6163                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6164     }
6165   while (SCHARS (val) == 0);
       /* Return the symbol whose name is the string that was read.  */
6166   return (Fintern (val, Qnil));
6167 }
6168
6169 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6170        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6171 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6172      (prompt, default_coding_system)
6173      Lisp_Object prompt, default_coding_system;
6174 {
6175   Lisp_Object val;
       /* A symbol default is handed to Fcompleting_read as its name
          string.  */
6176   if (SYMBOLP (default_coding_system))
6177     default_coding_system = SYMBOL_NAME (default_coding_system);
6178   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6179                           Qt, Qnil, Qcoding_system_history,
6180                           default_coding_system, Qnil);
       /* An empty result string yields nil; anything else is interned.  */
6181   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6182 }
6183
6184 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6185        1, 1, 0,
6186        doc: /* Check validity of CODING-SYSTEM.
6187 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6188 It is valid if it is a symbol with a non-nil `coding-system' property.
6189 The value of property should be a vector of length 5.  */)
6190      (coding_system)
6191      Lisp_Object coding_system;
6192 {
6193   CHECK_SYMBOL (coding_system);
       /* The validity test itself is delegated to Fcoding_system_p.  */
6194   if (!NILP (Fcoding_system_p (coding_system)))
6195     return coding_system;
       /* Signal the error with CODING-SYSTEM as its data.  The signal is
          raised again if Fsignal ever returns -- presumably so that this
          function never falls off its end; confirm against Fsignal.  */
6196   while (1)
6197     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6198 }
6199 \f
6200 Lisp_Object
6201 detect_coding_system (src, src_bytes, highest, multibytep)
6202      const unsigned char *src;
6203      int src_bytes, highest;
6204      int multibytep;
6205 {
       /* Detect how the SRC_BYTES bytes at SRC are encoded.  If HIGHEST
          is nonzero, return a single coding system (or nil if none is
          available); otherwise return a list of candidates in the order
          of Vcoding_category_list.  MULTIBYTEP is passed through to
          detect_coding_mask.  */
6206   int coding_mask, eol_type;
6207   Lisp_Object val, tmp;
6208   int dummy;
6209
6210   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6211   eol_type  = detect_eol_type (src, src_bytes, &dummy);
       /* An inconsistent end-of-line format is treated as undecided.  */
6212   if (eol_type == CODING_EOL_INCONSISTENT)
6213     eol_type = CODING_EOL_UNDECIDED;
6214
6215   if (!coding_mask)
6216     {
           /* No category was detected: answer `undecided', or its
              subsidiary for the detected end-of-line format.  */
6217       val = Qundecided;
6218       if (eol_type != CODING_EOL_UNDECIDED)
6219         {
6220           Lisp_Object val2;
6221           val2 = Fget (Qundecided, Qeol_type);
6222           if (VECTORP (val2))
6223             val = XVECTOR (val2)->contents[eol_type];
6224         }
6225       return (highest ? val : Fcons (val, Qnil));
6226     }
6227
6228   /* At first, gather possible coding systems in VAL.  */
6229   val = Qnil;
6230   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6231     {
6232       Lisp_Object category_val, category_index;
6233
6234       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6235       category_val = Fsymbol_value (XCAR (tmp));
           /* Keep the category's value only if it is non-nil and the
              category's bit is set in CODING_MASK.  */
6236       if (!NILP (category_val)
6237           && NATNUMP (category_index)
6238           && (coding_mask & (1 << XFASTINT (category_index))))
6239         {
6240           val = Fcons (category_val, val);
               /* Only the first match is wanted in this case.  */
6241           if (highest)
6242             break;
6243         }
6244     }
       /* The list was built by consing onto the front, so restore the
          order of Vcoding_category_list.  */
6245   if (!highest)
6246     val = Fnreverse (val);
6247
6248   /* Then, replace the elements with subsidiary coding systems.  */
6249   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6250     {
6251       if (eol_type != CODING_EOL_UNDECIDED
6252           && eol_type != CODING_EOL_INCONSISTENT)
6253         {
6254           Lisp_Object eol;
6255           eol = Fget (XCAR (tmp), Qeol_type);
6256           if (VECTORP (eol))
6257             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6258         }
6259     }
       /* VAL is still nil when no category with a non-nil value matched
          the mask; XCAR must not be applied to it in that case.  */
6260   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
6261 }
6262
6263 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6264        2, 3, 0,
6265        doc: /* Detect coding system of the text in the region between START and END.
6266 Return a list of possible coding systems ordered by priority.
6267
6268 If only ASCII characters are found, it returns a list of single element
6269 `undecided' or its subsidiary coding system according to a detected
6270 end-of-line format.
6271
6272 If optional argument HIGHEST is non-nil, return the coding system of
6273 highest priority.  */)
6274      (start, end, highest)
6275      Lisp_Object start, end, highest;
6276 {
6277   int from, to;
6278   int from_byte, to_byte;
6279   int include_anchor_byte = 0;
6280
6281   CHECK_NUMBER_COERCE_MARKER (start);
6282   CHECK_NUMBER_COERCE_MARKER (end);
6283
6284   validate_region (&start, &end);
6285   from = XINT (start), to = XINT (end);
6286   from_byte = CHAR_TO_BYTE (from);
6287   to_byte = CHAR_TO_BYTE (to);
6288
       /* When the region starts before the gap and ends at or after it,
          move the gap to the end of the region -- presumably so that the
          region's bytes can be passed below as one contiguous block.  */
6289   if (from < GPT && to >= GPT)
6290     move_gap_both (to, to_byte);
6291   /* If an anchor byte `\0' follows the region, we include it in
6292      the detecting source.  Then code detectors can handle the trailing
6293      byte sequence more accurately.
6294
6295      Fix me: This is not a perfect solution.  It is better that we
6296      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6297   */
6298   if (to == Z || (to == GPT && GAP_SIZE > 0))
6299     include_anchor_byte = 1;
6300   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6301                                to_byte - from_byte + include_anchor_byte,
6302                                !NILP (highest),
6303                                !NILP (current_buffer
6304                                       ->enable_multibyte_characters));
6305 }
6306
6307 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6308        1, 2, 0,
6309        doc: /* Detect coding system of the text in STRING.
6310 Return a list of possible coding systems ordered by priority.
6311
6312 If only ASCII characters are found, it returns a list of single element
6313 `undecided' or its subsidiary coding system according to a detected
6314 end-of-line format.
6315
6316 If optional argument HIGHEST is non-nil, return the coding system of
6317 highest priority.  */)
6318      (string, highest)
6319      Lisp_Object string, highest;
6320 {
6321   CHECK_STRING (string);
6322
       /* All the work is done by detect_coding_system on the string's
          bytes.  */
6323   return detect_coding_system (SDATA (string),
6324                                /* "+ 1" is to include the anchor byte
6325                                   `\0'.  With this, code detectors can
6326                                   handle the trailing bytes more
6327                                   accurately.  */
6328                                SBYTES (string) + 1,
6329                                !NILP (highest),
6330                                STRING_MULTIBYTE (string));
6331 }
6332
6333 /* Return an intersection of lists L1 and L2.  */
6334
6335 static Lisp_Object
6336 intersection (l1, l2)
6337      Lisp_Object l1, l2;
6338 {
       /* VAL starts as a dummy head cell; TAIL always points at the last
          cell of the list being built, so matches are appended in the
          order in which they occur in L1.  */
6339   Lisp_Object val = Fcons (Qnil, Qnil), tail;
6340
6341   for (tail = val; CONSP (l1); l1 = XCDR (l1))
6342     {
           /* Keep an element of L1 only if Fmemq finds it in L2.  */
6343       if (!NILP (Fmemq (XCAR (l1), l2)))
6344         {
6345           XSETCDR (tail, Fcons (XCAR (l1), Qnil));
6346           tail = XCDR (tail);
6347         }
6348     }
       /* Drop the dummy head cell.  */
6349   return XCDR (val);
6350 }
6351
6352
6353 /*  Subroutine for Ffind_coding_systems_region_internal.
6354
6355     Return a list of coding systems that safely encode the multibyte
6356     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
6357     possible coding systems.  If it is nil, it means that we have not
6358     yet found any coding systems.
6359
6360     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
6361     element of WORK_TABLE is set to t once the element is looked up.
6362
6363     If a non-ASCII single byte char is found, set
6364     *single_byte_char_found to 1.  */
6365
6366 static Lisp_Object
6367 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6368      unsigned char *p, *pend;
6369      Lisp_Object safe_codings, work_table;
6370      int *single_byte_char_found;
6371 {
6372   int c, len, idx;
6373   Lisp_Object val;
6374
6375   while (p < pend)
6376     {
           /* Fetch the character at P and step over its LEN bytes.  */
6377       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6378       p += len;
6379       if (ASCII_BYTE_P (c))
6380         /* We can ignore ASCII characters here.  */
6381         continue;
6382       if (SINGLE_BYTE_CHAR_P (c))
6383         *single_byte_char_found = 1;
           /* Once the candidate list has become nil there is nothing
              left to narrow; the scan continues only to keep updating
              *SINGLE_BYTE_CHAR_FOUND.  */
6384       if (NILP (safe_codings))
6385         continue;
6386       /* Check the safe coding systems for C.  */
6387       val = char_table_ref_and_index (work_table, c, &idx);
6388       if (EQ (val, Qt))
6389         /* This element was already checked.  Ignore it.  */
6390         continue;
6391       /* Remember that we checked this element.  */
6392       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
6393
6394       /* If there are some safe coding systems for C and we have
6395          already found the other set of coding systems for the
6396          different characters, get the intersection of them.  */
6397       if (!EQ (safe_codings, Qt) && !NILP (val))
6398         val = intersection (safe_codings, val);
           /* In the remaining cases (SAFE_CODINGS is still t, or VAL is
              nil) VAL is adopted as is.  */
6399       safe_codings = val;
6400     }
6401   return safe_codings;
6402 }
6403
6404
6405 /* Return a list of coding systems that safely encode the text between
6406    START and END.  If the text contains only ASCII or is unibyte,
6407    return t.  */
6408
6409 DEFUN ("find-coding-systems-region-internal",
6410        Ffind_coding_systems_region_internal,
6411        Sfind_coding_systems_region_internal, 2, 2, 0,
6412        doc: /* Internal use only.  */)
6413      (start, end)
6414      Lisp_Object start, end;
6415 {
6416   Lisp_Object work_table, safe_codings;
6417   int non_ascii_p = 0;
6418   int single_byte_char_found = 0;
       /* The text is examined as up to two byte ranges, P1..P1END and
          P2..P2END; the second one is empty unless buffer text is split
          by the gap.  */
6419   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6420
6421   if (STRINGP (start))
6422     {
           /* START is a string: it is the text to check, and END is not
              looked at.  */
6423       if (!STRING_MULTIBYTE (start))
6424         return Qt;
6425       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6426       p2 = p2end = p1end;
           /* More bytes than characters means at least one multibyte
              character.  */
6427       if (SCHARS (start) != SBYTES (start))
6428         non_ascii_p = 1;
6429     }
6430   else
6431     {
6432       int from, to, stop;
6433
6434       CHECK_NUMBER_COERCE_MARKER (start);
6435       CHECK_NUMBER_COERCE_MARKER (end);
6436       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6437         args_out_of_range (start, end);
6438       if (NILP (current_buffer->enable_multibyte_characters))
6439         return Qt;
6440       from = CHAR_TO_BYTE (XINT (start));
6441       to = CHAR_TO_BYTE (XINT (end));
           /* Split the byte range at the gap when the gap lies strictly
              inside it.  */
6442       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6443       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6444       if (stop == to)
6445         p2 = p2end = p1end;
6446       else
6447         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
           /* Same character-count versus byte-count test as for
              strings.  */
6448       if (XINT (end) - XINT (start) != to - from)
6449         non_ascii_p = 1;
6450     }
6451
6452   if (!non_ascii_p)
6453     {
6454       /* We are sure that the text contains no multibyte character.
6455          Check if it contains eight-bit-graphic.  */
6456       p = p1;
6457       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6458       if (p == p1end)
6459         {
6460           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
               /* Both ranges hold ASCII bytes only.  */
6461           if (p == p2end)
6462             return Qt;
6463         }
6464     }
6465
6466   /* The text contains non-ASCII characters.  */
       /* find_safe_codings marks the entries it has looked up, so it is
          given a copy of the table rather than the table itself.  */
6467   work_table = Fcopy_sequence (Vchar_coding_system_table);
6468   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
6469                                     &single_byte_char_found);
6470   if (p2 < p2end)
6471     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6472                                       &single_byte_char_found);
6473
6474   if (EQ (safe_codings, Qt))
6475     ; /* Nothing to be done.  */
6476   else if (!single_byte_char_found)
6477     {
6478       /* Append generic coding systems.  */
6479       Lisp_Object args[2];
6480       args[0] = safe_codings;
6481       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
6482                                         make_number (0));
6483       safe_codings = Fappend (2, args);
6484     }
6485   else
         /* A non-ASCII single byte character was seen: put raw-text,
            emacs-mule and no-conversion in front of the list.  */
6486     safe_codings = Fcons (Qraw_text,
6487                           Fcons (Qemacs_mule,
6488                                  Fcons (Qno_conversion, safe_codings)));
6489   return safe_codings;
6490 }
6491
6492
6493 /* Search from position POS for characters that are unencodable
6494    according to SAFE_CHARS, and return a list of their positions.  P
6495    points to where in memory the character at POS is stored.  Stop
6496    the search at PEND or once N unencodable characters are found.
6497
6498    If SAFE_CHARS is a char table, an element for an unencodable
6499    character is nil.
6500
6501    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6502
6503    Otherwise, SAFE_CHARS is t, and only eight-bit-control and
6504    eight-bit-graphic characters are unencodable.  */
6505
6506 static Lisp_Object
6507 unencodable_char_position (safe_chars, pos, p, pend, n)
6508      Lisp_Object safe_chars;
6509      int pos;
6510      unsigned char *p, *pend;
6511      int n;
6512 {
6513   Lisp_Object pos_list;
6514
6515   pos_list = Qnil;
6516   while (p < pend)
6517     {
6518       int len;
6519       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6520
           /* Characters below 128 are never reported.  For the others:
              with a char table, a nil entry marks C as unencodable;
              with nil, every such character is; otherwise only those
              below 256 are.  */
6521       if (c >= 128
6522           && (CHAR_TABLE_P (safe_chars)
6523               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6524               : (NILP (safe_chars) || c < 256)))
6525         {
6526           pos_list = Fcons (make_number (pos), pos_list);
               /* Stop as soon as the requested number has been found.  */
6527           if (--n <= 0)
6528             break;
6529         }
           /* Advance one character: POS counts characters, P bytes.  */
6530       pos++;
6531       p += len;
6532     }
       /* Positions were pushed on the front; return them in ascending
          order.  */
6533   return Fnreverse (pos_list);
6534 }
6535
6536
6537 DEFUN ("unencodable-char-position", Funencodable_char_position,
6538        Sunencodable_char_position, 3, 5, 0,
6539        doc: /*
6540 Return position of first un-encodable character in a region.
6541 START and END specify the region and CODING-SYSTEM specifies the
6542 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6543
6544 If optional 4th argument COUNT is non-nil, it specifies at most how
6545 many un-encodable characters to search.  In this case, the value is a
6546 list of positions.
6547
6548 If optional 5th argument STRING is non-nil, it is a string to search
6549 for un-encodable characters.  In that case, START and END are indexes
6550 to the string.  */)
6551      (start, end, coding_system, count, string)
6552      Lisp_Object start, end, coding_system, count, string;
6553 {
6554   int n;
6555   Lisp_Object safe_chars;
6556   struct coding_system coding;
6557   Lisp_Object positions;
6558   int from, to;
6559   unsigned char *p, *pend;
6560
6561   if (NILP (string))
6562     {
           /* Search the current buffer.  A unibyte buffer yields nil
              right away.  */
6563       validate_region (&start, &end);
6564       from = XINT (start);
6565       to = XINT (end);
6566       if (NILP (current_buffer->enable_multibyte_characters))
6567         return Qnil;
6568       p = CHAR_POS_ADDR (from);
6569       pend = CHAR_POS_ADDR (to);
6570     }
6571   else
6572     {
           /* Search STRING; START and END are character indexes into
              it.  A unibyte string yields nil right away.  */
6573       CHECK_STRING (string);
6574       CHECK_NATNUM (start);
6575       CHECK_NATNUM (end);
6576       from = XINT (start);
6577       to = XINT (end);
6578       if (from > to
6579           || to > SCHARS (string))
6580         args_out_of_range_3 (string, start, end);
6581       if (! STRING_MULTIBYTE (string))
6582         return Qnil;
6583       p = SDATA (string) + string_char_to_byte (string, from);
6584       pend = SDATA (string) + string_char_to_byte (string, to);
6585     }
6586
6587   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6588
       /* Without COUNT, only the first position is wanted.  */
6589   if (NILP (count))
6590     n = 1;
6591   else
6592     {
6593       CHECK_NATNUM (count);
6594       n = XINT (count);
6595     }
6596
       /* For these two coding types nothing is reported as
          un-encodable.  */
6597   if (coding.type == coding_type_no_conversion
6598       || coding.type == coding_type_raw_text)
6599     return Qnil;
6600
       /* A nil SAFE_CHARS makes every non-ASCII character count as
          un-encodable in unencodable_char_position.  */
6601   if (coding.type == coding_type_undecided)
6602     safe_chars = Qnil;
6603   else
6604     safe_chars = coding_safe_chars (&coding);
6605
       /* A string, or buffer text lying entirely on one side of the gap,
          is scanned in a single pass.  */
6606   if (STRINGP (string)
6607       || from >= GPT || to <= GPT)
6608     positions = unencodable_char_position (safe_chars, from, p, pend, n);
6609   else
6610     {
           /* The region straddles the gap: scan the part before the gap,
              then, if more positions are still wanted, the part after
              it, and join the two lists.  */
6611       Lisp_Object args[2];
6612
6613       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6614       n -= XINT (Flength (args[0]));
6615       if (n <= 0)
6616         positions = args[0];
6617       else
6618         {
6619           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6620                                                pend, n);
6621           positions = Fappend (2, args);
6622         }
6623     }
6624
       /* Without COUNT the value is a single position (or nil), not a
          list.  */
6625   return  (NILP (count) ? Fcar (positions) : positions);
6626 }
6627
6628
6629 Lisp_Object
6630 code_convert_region1 (start, end, coding_system, encodep)
6631      Lisp_Object start, end, coding_system;
6632      int encodep;
6633 {
       /* Common body of Fdecode_coding_region (ENCODEP zero) and
          Fencode_coding_region (ENCODEP nonzero): convert the text
          between START and END of the current buffer according to
          CODING_SYSTEM and return the number of characters produced.  */
6634   struct coding_system coding;
6635   int from, to;
6636
6637   CHECK_NUMBER_COERCE_MARKER (start);
6638   CHECK_NUMBER_COERCE_MARKER (end);
6639   CHECK_SYMBOL (coding_system);
6640
6641   validate_region (&start, &end);
6642   from = XFASTINT (start);
6643   to = XFASTINT (end);
6644
       /* With a nil coding system no conversion is done and the length
          of the region is returned.  */
6645   if (NILP (coding_system))
6646     return make_number (to - from);
6647
6648   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6649     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6650
       /* The region is converted in one go, so it is flagged as the
          last block.  Source and destination multibyteness both follow
          the current buffer.  */
6651   coding.mode |= CODING_MODE_LAST_BLOCK;
6652   coding.src_multibyte = coding.dst_multibyte
6653     = !NILP (current_buffer->enable_multibyte_characters);
6654   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6655                        &coding, encodep, 1);
       /* Record the coding system symbol left in CODING after the
          conversion.  */
6656   Vlast_coding_system_used = coding.symbol;
6657   return make_number (coding.produced_char);
6658 }
6659
6660 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6661        3, 3, "r\nzCoding system: ",
6662        doc: /* Decode the current region from the specified coding system.
6663 When called from a program, takes three arguments:
6664 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6665 This function sets `last-coding-system-used' to the precise coding system
6666 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6667 not fully specified.)
6668 It returns the length of the decoded text.  */)
6669      (start, end, coding_system)
6670      Lisp_Object start, end, coding_system;
6671 {
       /* The final argument is ENCODEP; zero selects decoding.  */
6672   return code_convert_region1 (start, end, coding_system, 0);
6673 }
6674
6675 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6676        3, 3, "r\nzCoding system: ",
6677        doc: /* Encode the current region into the specified coding system.
6678 When called from a program, takes three arguments:
6679 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6680 This function sets `last-coding-system-used' to the precise coding system
6681 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6682 not fully specified.)
6683 It returns the length of the encoded text.  */)
6684      (start, end, coding_system)
6685      Lisp_Object start, end, coding_system;
6686 {
       /* The final argument is ENCODEP; nonzero selects encoding.  */
6687   return code_convert_region1 (start, end, coding_system, 1);
6688 }
6689
6690 Lisp_Object
6691 code_convert_string1 (string, coding_system, nocopy, encodep)
6692      Lisp_Object string, coding_system, nocopy;
6693      int encodep;
6694 {
       /* Common body of Fdecode_coding_string (ENCODEP zero) and
          Fencode_coding_string (ENCODEP nonzero): convert STRING
          according to CODING_SYSTEM and return the result.  NOCOPY is a
          Lisp value; non-nil allows STRING itself to be returned.  */
6695   struct coding_system coding;
6696
6697   CHECK_STRING (string);
6698   CHECK_SYMBOL (coding_system);
6699
       /* With a nil coding system no conversion is done.  */
6700   if (NILP (coding_system))
6701     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6702
6703   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6704     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6705
       /* The whole string is converted in one go, so it is flagged as
          the last block.  */
6706   coding.mode |= CODING_MODE_LAST_BLOCK;
6707   string = (encodep
6708             ? encode_coding_string (string, &coding, !NILP (nocopy))
6709             : decode_coding_string (string, &coding, !NILP (nocopy)));
       /* Record the coding system symbol left in CODING after the
          conversion.  */
6710   Vlast_coding_system_used = coding.symbol;
6711
6712   return string;
6713 }
6714
6715 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6716        2, 3, 0,
6717        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6718 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6719 if the decoding operation is trivial.
6720 This function sets `last-coding-system-used' to the precise coding system
6721 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6722 not fully specified.)  */)
6723      (string, coding_system, nocopy)
6724      Lisp_Object string, coding_system, nocopy;
6725 {
       /* The final argument is ENCODEP; zero selects decoding.  */
6726   return code_convert_string1 (string, coding_system, nocopy, 0);
6727 }
6728
6729 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6730        2, 3, 0,
6731        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6732 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6733 if the encoding operation is trivial.
6734 This function sets `last-coding-system-used' to the precise coding system
6735 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6736 not fully specified.)  */)
6737      (string, coding_system, nocopy)
6738      Lisp_Object string, coding_system, nocopy;
6739 {
       /* The final argument is ENCODEP; nonzero selects encoding.  */
6740   return code_convert_string1 (string, coding_system, nocopy, 1);
6741 }
6742
6743 /* Encode or decode STRING according to CODING_SYSTEM.
6744    Do not set Vlast_coding_system_used.
6745
6746    This function is called only from macros DECODE_FILE and
6747    ENCODE_FILE, thus we ignore character composition.  */
6748
6749 Lisp_Object
6750 code_convert_string_norecord (string, coding_system, encodep)
6751      Lisp_Object string, coding_system;
6752      int encodep;
6753 {
6754   struct coding_system coding;
6755
6756   CHECK_STRING (string);
6757   CHECK_SYMBOL (coding_system);
6758
       /* With a nil coding system STRING itself is returned, without
          copying.  */
6759   if (NILP (coding_system))
6760     return string;
6761
6762   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6763     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6764
       /* Character composition is not handled here.  The whole string
          is converted in one go, so it is flagged as the last block.  */
6765   coding.composing = COMPOSITION_DISABLED;
6766   coding.mode |= CODING_MODE_LAST_BLOCK;
       /* NOCOPY is passed as 1, so STRING itself may come back.  */
6767   return (encodep
6768           ? encode_coding_string (string, &coding, 1)
6769           : decode_coding_string (string, &coding, 1));
6770 }
6771 \f
6772 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6773        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6774 Return the corresponding character.  */)
6775      (code)
6776      Lisp_Object code;
6777 {
6778   unsigned char c1, c2, s1, s2;
6779   Lisp_Object val;
6780
6781   CHECK_NUMBER (code);
       /* S1 is the second-lowest byte of CODE and S2 its lowest byte.  */
6782   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6783   if (s1 == 0)
6784     {
           /* One-byte code: below 0x80 it is returned unchanged.  */
6785       if (s2 < 0x80)
6786         XSETFASTINT (val, s2);
           /* Only the range 0xA0..0xDF is accepted as a one-byte
              katakana code.  Both bounds must hold; joined with `||'
              the test was always true and the error below could never
              be reached.  */
6787       else if (s2 >= 0xA0 && s2 <= 0xDF)
6788         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6789       else
6790         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6791     }
6792   else
6793     {
           /* Two-byte code: reject a first byte outside 0x80..0x9F and
              0xE0..0xEF, and a second byte outside 0x40..0xFC or equal
              to 0x7F.  */
6794       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
6795           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6796         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6797       DECODE_SJIS (s1, s2, c1, c2);
6798       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6799     }
6800   return val;
6801 }
6802
6803 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6804        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6805 Return the corresponding code in SJIS.  */)
6806      (ch)
6807      Lisp_Object ch;
6808 {
6809   int charset, c1, c2, s1, s2;
6810   Lisp_Object val;
6811
6812   CHECK_NUMBER (ch);
       /* Break CH into its charset and position codes C1 and C2.  */
6813   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6814   if (charset == CHARSET_ASCII)
6815     {
           /* ASCII characters are returned unchanged.  */
6816       val = ch;
6817     }
6818   else if (charset == charset_jisx0208
6819            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6820     {
           /* Both position codes are within 0x21..0x7E: the result is
              the two SJIS bytes packed as (S1 << 8) | S2.  */
6821       ENCODE_SJIS (c1, c2, s1, s2);
6822       XSETFASTINT (val, (s1 << 8) | s2);
6823     }
       /* NOTE(review): the upper bound below is tested on C2, not on
          C1, although only C1 is used for the result -- confirm that
          this is intended.  */
6824   else if (charset == charset_katakana_jisx0201
6825            && c1 > 0x20 && c2 < 0xE0)
6826     {
           /* The one-byte code is C1 with the high bit set.  */
6827       XSETFASTINT (val, c1 | 0x80);
6828     }
6829   else
6830     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6831   return val;
6832 }
6833
6834 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6835        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6836 Return the corresponding character.  */)
6837      (code)
6838      Lisp_Object code;
6839 {
6840   int charset;
6841   unsigned char b1, b2, c1, c2;
6842   Lisp_Object val;
6843
6844   CHECK_NUMBER (code);
       /* B1 is the second-lowest byte of CODE and B2 its lowest byte.  */
6845   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6846   if (b1 == 0)
6847     {
           /* One-byte code: only values below 0x80 are valid, and they
              are returned unchanged.  */
6848       if (b2 >= 0x80)
6849         error ("Invalid BIG5 code: %x", XFASTINT (code));
6850       val = code;
6851     }
6852   else
6853     {
           /* Two-byte code: the first byte must be within 0xA1..0xFE
              and the second within 0x40..0x7E or 0xA1..0xFE.  */
6854       if ((b1 < 0xA1 || b1 > 0xFE)
6855           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6856         error ("Invalid BIG5 code: %x", XFASTINT (code));
           /* DECODE_BIG5 also chooses the charset of the result.  */
6857       DECODE_BIG5 (b1, b2, charset, c1, c2);
6858       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6859     }
6860   return val;
6861 }
6862
6863 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6864        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6865 Return the corresponding character code in Big5.  */)
6866      (ch)
6867      Lisp_Object ch;
6868 {
6869   int charset, c1, c2, b1, b2;
6870   Lisp_Object val;
6871
6872   CHECK_NUMBER (ch);
       /* Break CH into its charset and position codes C1 and C2.  */
6873   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6874   if (charset == CHARSET_ASCII)
6875     {
           /* ASCII characters are returned unchanged.  */
6876       val = ch;
6877     }
       /* Only characters of the two Big5 charsets whose code lies in the
          listed range are encodable -- presumably these ranges are the
          defined characters of each charset; verify against the charset
          definitions.  */
6878   else if ((charset == charset_big5_1
6879             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6880            || (charset == charset_big5_2
6881                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6882     {
           /* The result is the two Big5 bytes packed as (B1 << 8) | B2.  */
6883       ENCODE_BIG5 (charset, c1, c2, b1, b2);
6884       XSETFASTINT (val, (b1 << 8) | b2);
6885     }
6886   else
6887     error ("Can't encode to Big5: %d", XFASTINT (ch));
6888   return val;
6889 }
6890 \f
6891 DEFUN ("set-terminal-coding-system-internal",
6892        Fset_terminal_coding_system_internal,
6893        Sset_terminal_coding_system_internal, 1, 1, 0,
6894        doc: /* Internal use only.  */)
6895      (coding_system)
6896      Lisp_Object coding_system;
6897 {
6898   CHECK_SYMBOL (coding_system);
       /* Set up terminal_coding from CODING_SYSTEM, then adjust it.  */
6899   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6900   /* We had better not send unsafe characters to terminal.  */
6901   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6902   /* Character composition should be disabled.  */
6903   terminal_coding.composing = COMPOSITION_DISABLED;
6904   /* Error notification should be suppressed.  */
6905   terminal_coding.suppress_error = 1;
       /* The source text is treated as multibyte, the output as
          unibyte.  */
6906   terminal_coding.src_multibyte = 1;
6907   terminal_coding.dst_multibyte = 0;
6908   return Qnil;
6909 }
6910
6911 DEFUN ("set-safe-terminal-coding-system-internal",
6912        Fset_safe_terminal_coding_system_internal,
6913        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6914        doc: /* Internal use only.  */)
6915      (coding_system)
6916      Lisp_Object coding_system;
6917 {
6918   CHECK_SYMBOL (coding_system);
       /* Set up safe_terminal_coding from CODING_SYSTEM, then adjust
          it.  Only safe_terminal_coding is touched here.  */
6919   setup_coding_system (Fcheck_coding_system (coding_system),
6920                        &safe_terminal_coding);
6921   /* Character composition should be disabled.  */
6922   safe_terminal_coding.composing = COMPOSITION_DISABLED;
6923   /* Error notification should be suppressed.  */
6924   safe_terminal_coding.suppress_error = 1;
       /* The source text is treated as multibyte, the output as
          unibyte.  */
6925   safe_terminal_coding.src_multibyte = 1;
6926   safe_terminal_coding.dst_multibyte = 0;
6927   return Qnil;
6928 }
6929
6930 DEFUN ("terminal-coding-system",
6931        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6932        doc: /* Return coding system specified for terminal output.  */)
6933      ()
6934 {
6935   return terminal_coding.symbol;
6936 }
6937
6938 DEFUN ("set-keyboard-coding-system-internal",
6939        Fset_keyboard_coding_system_internal,
6940        Sset_keyboard_coding_system_internal, 1, 1, 0,
6941        doc: /* Internal use only.  */)
6942      (coding_system)
6943      Lisp_Object coding_system;
6944 {
6945   CHECK_SYMBOL (coding_system);
6946   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6947   /* Character composition should be disabled.  */
6948   keyboard_coding.composing = COMPOSITION_DISABLED;
6949   return Qnil;
6950 }
6951
6952 DEFUN ("keyboard-coding-system",
6953        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6954        doc: /* Return coding system specified for decoding keyboard input.  */)
6955      ()
6956 {
6957   return keyboard_coding.symbol;
6958 }
6959
6960 \f
6961 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6962        Sfind_operation_coding_system,  1, MANY, 0,
6963        doc: /* Choose a coding system for an operation based on the target name.
6964 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
6965 DECODING-SYSTEM is the coding system to use for decoding
6966 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
6967 for encoding (in case OPERATION does encoding).
6968
6969 The first argument OPERATION specifies an I/O primitive:
6970   For file I/O, `insert-file-contents' or `write-region'.
6971   For process I/O, `call-process', `call-process-region', or `start-process'.
6972   For network I/O, `open-network-stream'.
6973
6974 The remaining arguments should be the same arguments that were passed
6975 to the primitive.  Depending on which primitive, one of those arguments
6976 is selected as the TARGET.  For example, if OPERATION does file I/O,
6977 whichever argument specifies the file name is TARGET.
6978
6979 TARGET has a meaning which depends on OPERATION:
6980   For file I/O, TARGET is a file name.
6981   For process I/O, TARGET is a process name.
6982   For network I/O, TARGET is a service name or a port number
6983
6984 This function looks up what specified for TARGET in,
6985 `file-coding-system-alist', `process-coding-system-alist',
6986 or `network-coding-system-alist' depending on OPERATION.
6987 They may specify a coding system, a cons of coding systems,
6988 or a function symbol to call.
6989 In the last case, we call the function with one argument,
6990 which is a list of all the arguments given to this function.
6991
6992 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
6993      (nargs, args)
6994      int nargs;
6995      Lisp_Object *args;
6996 {
6997   Lisp_Object operation, target_idx, target, val;
6998   register Lisp_Object chain;
6999
7000   if (nargs < 2)
7001     error ("Too few arguments");
7002   operation = args[0];
7003   if (!SYMBOLP (operation)
7004       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7005     error ("Invalid first argument");
7006   if (nargs < 1 + XINT (target_idx))
7007     error ("Too few arguments for operation: %s",
7008            SDATA (SYMBOL_NAME (operation)));
7009   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7010      argument to write-region) is string, it must be treated as a
7011      target file name.  */
7012   if (EQ (operation, Qwrite_region)
7013       && nargs > 5
7014       && STRINGP (args[5]))
7015     target_idx = make_number (4);
7016   target = args[XINT (target_idx) + 1];
7017   if (!(STRINGP (target)
7018         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7019     error ("Invalid argument %d", XINT (target_idx) + 1);
7020
7021   chain = ((EQ (operation, Qinsert_file_contents)
7022             || EQ (operation, Qwrite_region))
7023            ? Vfile_coding_system_alist
7024            : (EQ (operation, Qopen_network_stream)
7025               ? Vnetwork_coding_system_alist
7026               : Vprocess_coding_system_alist));
7027   if (NILP (chain))
7028     return Qnil;
7029
7030   for (; CONSP (chain); chain = XCDR (chain))
7031     {
7032       Lisp_Object elt;
7033       elt = XCAR (chain);
7034
7035       if (CONSP (elt)
7036           && ((STRINGP (target)
7037                && STRINGP (XCAR (elt))
7038                && fast_string_match (XCAR (elt), target) >= 0)
7039               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7040         {
7041           val = XCDR (elt);
7042           /* Here, if VAL is both a valid coding system and a valid
7043              function symbol, we return VAL as a coding system.  */
7044           if (CONSP (val))
7045             return val;
7046           if (! SYMBOLP (val))
7047             return Qnil;
7048           if (! NILP (Fcoding_system_p (val)))
7049             return Fcons (val, val);
7050           if (! NILP (Ffboundp (val)))
7051             {
7052               val = call1 (val, Flist (nargs, args));
7053               if (CONSP (val))
7054                 return val;
7055               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7056                 return Fcons (val, val);
7057             }
7058           return Qnil;
7059         }
7060     }
7061   return Qnil;
7062 }
7063
7064 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7065        Supdate_coding_systems_internal, 0, 0, 0,
7066        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7067 When values of any coding categories are changed, you must
7068 call this function.  */)
7069      ()
7070 {
7071   int i;
7072
7073   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7074     {
7075       Lisp_Object val;
7076
7077       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7078       if (!NILP (val))
7079         {
7080           if (! coding_system_table[i])
7081             coding_system_table[i] = ((struct coding_system *)
7082                                       xmalloc (sizeof (struct coding_system)));
7083           setup_coding_system (val, coding_system_table[i]);
7084         }
7085       else if (coding_system_table[i])
7086         {
7087           xfree (coding_system_table[i]);
7088           coding_system_table[i] = NULL;
7089         }
7090     }
7091
7092   return Qnil;
7093 }
7094
7095 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7096        Sset_coding_priority_internal, 0, 0, 0,
7097        doc: /* Update internal database for the current value of `coding-category-list'.
7098 This function is internal use only.  */)
7099      ()
7100 {
7101   int i = 0, idx;
7102   Lisp_Object val;
7103
7104   val = Vcoding_category_list;
7105
7106   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7107     {
7108       if (! SYMBOLP (XCAR (val)))
7109         break;
7110       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7111       if (idx >= CODING_CATEGORY_IDX_MAX)
7112         break;
7113       coding_priorities[i++] = (1 << idx);
7114       val = XCDR (val);
7115     }
7116   /* If coding-category-list is valid and contains all coding
7117      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7118      the following code saves Emacs from crashing.  */
7119   while (i < CODING_CATEGORY_IDX_MAX)
7120     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7121
7122   return Qnil;
7123 }
7124
7125 #endif /* emacs */
7126
7127 \f
7128 /*** 9. Post-amble ***/
7129
7130 void
7131 init_coding_once ()
7132 {
7133   int i;
7134
7135   /* Emacs' internal format specific initialize routine.  */
7136   for (i = 0; i <= 0x20; i++)
7137     emacs_code_class[i] = EMACS_control_code;
7138   emacs_code_class[0x0A] = EMACS_linefeed_code;
7139   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7140   for (i = 0x21 ; i < 0x7F; i++)
7141     emacs_code_class[i] = EMACS_ascii_code;
7142   emacs_code_class[0x7F] = EMACS_control_code;
7143   for (i = 0x80; i < 0xFF; i++)
7144     emacs_code_class[i] = EMACS_invalid_code;
7145   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7146   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7147   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7148   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7149
7150   /* ISO2022 specific initialize routine.  */
7151   for (i = 0; i < 0x20; i++)
7152     iso_code_class[i] = ISO_control_0;
7153   for (i = 0x21; i < 0x7F; i++)
7154     iso_code_class[i] = ISO_graphic_plane_0;
7155   for (i = 0x80; i < 0xA0; i++)
7156     iso_code_class[i] = ISO_control_1;
7157   for (i = 0xA1; i < 0xFF; i++)
7158     iso_code_class[i] = ISO_graphic_plane_1;
7159   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7160   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7161   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7162   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7163   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7164   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7165   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7166   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7167   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7168   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7169
7170   setup_coding_system (Qnil, &keyboard_coding);
7171   setup_coding_system (Qnil, &terminal_coding);
7172   setup_coding_system (Qnil, &safe_terminal_coding);
7173   setup_coding_system (Qnil, &default_buffer_file_coding);
7174
7175   bzero (coding_system_table, sizeof coding_system_table);
7176
7177   bzero (ascii_skip_code, sizeof ascii_skip_code);
7178   for (i = 0; i < 128; i++)
7179     ascii_skip_code[i] = 1;
7180
7181 #if defined (MSDOS) || defined (WINDOWSNT)
7182   system_eol_type = CODING_EOL_CRLF;
7183 #else
7184   system_eol_type = CODING_EOL_LF;
7185 #endif
7186
7187   inhibit_pre_post_conversion = 0;
7188 }
7189
7190 #ifdef emacs
7191
7192 void
7193 syms_of_coding ()
7194 {
7195   Qtarget_idx = intern ("target-idx");
7196   staticpro (&Qtarget_idx);
7197
7198   Qcoding_system_history = intern ("coding-system-history");
7199   staticpro (&Qcoding_system_history);
7200   Fset (Qcoding_system_history, Qnil);
7201
7202   /* Target FILENAME is the first argument.  */
7203   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7204   /* Target FILENAME is the third argument.  */
7205   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7206
7207   Qcall_process = intern ("call-process");
7208   staticpro (&Qcall_process);
7209   /* Target PROGRAM is the first argument.  */
7210   Fput (Qcall_process, Qtarget_idx, make_number (0));
7211
7212   Qcall_process_region = intern ("call-process-region");
7213   staticpro (&Qcall_process_region);
7214   /* Target PROGRAM is the third argument.  */
7215   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7216
7217   Qstart_process = intern ("start-process");
7218   staticpro (&Qstart_process);
7219   /* Target PROGRAM is the third argument.  */
7220   Fput (Qstart_process, Qtarget_idx, make_number (2));
7221
7222   Qopen_network_stream = intern ("open-network-stream");
7223   staticpro (&Qopen_network_stream);
7224   /* Target SERVICE is the fourth argument.  */
7225   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7226
7227   Qcoding_system = intern ("coding-system");
7228   staticpro (&Qcoding_system);
7229
7230   Qeol_type = intern ("eol-type");
7231   staticpro (&Qeol_type);
7232
7233   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7234   staticpro (&Qbuffer_file_coding_system);
7235
7236   Qpost_read_conversion = intern ("post-read-conversion");
7237   staticpro (&Qpost_read_conversion);
7238
7239   Qpre_write_conversion = intern ("pre-write-conversion");
7240   staticpro (&Qpre_write_conversion);
7241
7242   Qno_conversion = intern ("no-conversion");
7243   staticpro (&Qno_conversion);
7244
7245   Qundecided = intern ("undecided");
7246   staticpro (&Qundecided);
7247
7248   Qcoding_system_p = intern ("coding-system-p");
7249   staticpro (&Qcoding_system_p);
7250
7251   Qcoding_system_error = intern ("coding-system-error");
7252   staticpro (&Qcoding_system_error);
7253
7254   Fput (Qcoding_system_error, Qerror_conditions,
7255         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7256   Fput (Qcoding_system_error, Qerror_message,
7257         build_string ("Invalid coding system"));
7258
7259   Qcoding_category = intern ("coding-category");
7260   staticpro (&Qcoding_category);
7261   Qcoding_category_index = intern ("coding-category-index");
7262   staticpro (&Qcoding_category_index);
7263
7264   Vcoding_category_table
7265     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7266   staticpro (&Vcoding_category_table);
7267   {
7268     int i;
7269     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7270       {
7271         XVECTOR (Vcoding_category_table)->contents[i]
7272           = intern (coding_category_name[i]);
7273         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7274               Qcoding_category_index, make_number (i));
7275       }
7276   }
7277
7278   Qtranslation_table = intern ("translation-table");
7279   staticpro (&Qtranslation_table);
7280   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7281
7282   Qtranslation_table_id = intern ("translation-table-id");
7283   staticpro (&Qtranslation_table_id);
7284
7285   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7286   staticpro (&Qtranslation_table_for_decode);
7287
7288   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7289   staticpro (&Qtranslation_table_for_encode);
7290
7291   Qsafe_chars = intern ("safe-chars");
7292   staticpro (&Qsafe_chars);
7293
7294   Qchar_coding_system = intern ("char-coding-system");
7295   staticpro (&Qchar_coding_system);
7296
7297   /* Intern this now in case it isn't already done.
7298      Setting this variable twice is harmless.
7299      But don't staticpro it here--that is done in alloc.c.  */
7300   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7301   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7302   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7303
7304   Qvalid_codes = intern ("valid-codes");
7305   staticpro (&Qvalid_codes);
7306
7307   Qemacs_mule = intern ("emacs-mule");
7308   staticpro (&Qemacs_mule);
7309
7310   Qraw_text = intern ("raw-text");
7311   staticpro (&Qraw_text);
7312
7313   defsubr (&Scoding_system_p);
7314   defsubr (&Sread_coding_system);
7315   defsubr (&Sread_non_nil_coding_system);
7316   defsubr (&Scheck_coding_system);
7317   defsubr (&Sdetect_coding_region);
7318   defsubr (&Sdetect_coding_string);
7319   defsubr (&Sfind_coding_systems_region_internal);
7320   defsubr (&Sunencodable_char_position);
7321   defsubr (&Sdecode_coding_region);
7322   defsubr (&Sencode_coding_region);
7323   defsubr (&Sdecode_coding_string);
7324   defsubr (&Sencode_coding_string);
7325   defsubr (&Sdecode_sjis_char);
7326   defsubr (&Sencode_sjis_char);
7327   defsubr (&Sdecode_big5_char);
7328   defsubr (&Sencode_big5_char);
7329   defsubr (&Sset_terminal_coding_system_internal);
7330   defsubr (&Sset_safe_terminal_coding_system_internal);
7331   defsubr (&Sterminal_coding_system);
7332   defsubr (&Sset_keyboard_coding_system_internal);
7333   defsubr (&Skeyboard_coding_system);
7334   defsubr (&Sfind_operation_coding_system);
7335   defsubr (&Supdate_coding_systems_internal);
7336   defsubr (&Sset_coding_priority_internal);
7337
7338   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7339                doc: /* List of coding systems.
7340
7341 Do not alter the value of this variable manually.  This variable should be
7342 updated by the functions `make-coding-system' and
7343 `define-coding-system-alias'.  */);
7344   Vcoding_system_list = Qnil;
7345
7346   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7347                doc: /* Alist of coding system names.
7348 Each element is one element list of coding system name.
7349 This variable is given to `completing-read' as TABLE argument.
7350
7351 Do not alter the value of this variable manually.  This variable should be
7352 updated by the functions `make-coding-system' and
7353 `define-coding-system-alias'.  */);
7354   Vcoding_system_alist = Qnil;
7355
7356   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7357                doc: /* List of coding-categories (symbols) ordered by priority.
7358
7359 On detecting a coding system, Emacs tries code detection algorithms
7360 associated with each coding-category one by one in this order.  When
7361 one algorithm agrees with a byte sequence of source text, the coding
7362 system bound to the corresponding coding-category is selected.  */);
7363   {
7364     int i;
7365
7366     Vcoding_category_list = Qnil;
7367     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7368       Vcoding_category_list
7369         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7370                  Vcoding_category_list);
7371   }
7372
7373   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7374                doc: /* Specify the coding system for read operations.
7375 It is useful to bind this variable with `let', but do not set it globally.
7376 If the value is a coding system, it is used for decoding on read operation.
7377 If not, an appropriate element is used from one of the coding system alists:
7378 There are three such tables, `file-coding-system-alist',
7379 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7380   Vcoding_system_for_read = Qnil;
7381
7382   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7383                doc: /* Specify the coding system for write operations.
7384 Programs bind this variable with `let', but you should not set it globally.
7385 If the value is a coding system, it is used for encoding of output,
7386 when writing it to a file and when sending it to a file or subprocess.
7387
7388 If this does not specify a coding system, an appropriate element
7389 is used from one of the coding system alists:
7390 There are three such tables, `file-coding-system-alist',
7391 `process-coding-system-alist', and `network-coding-system-alist'.
7392 For output to files, if the above procedure does not specify a coding system,
7393 the value of `buffer-file-coding-system' is used.  */);
7394   Vcoding_system_for_write = Qnil;
7395
7396   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7397                doc: /* Coding system used in the latest file or process I/O.  */);
7398   Vlast_coding_system_used = Qnil;
7399
7400   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7401                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7402 See info node `Coding Systems' and info node `Text and Binary' concerning
7403 such conversion.  */);
7404   inhibit_eol_conversion = 0;
7405
7406   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7407                doc: /* Non-nil means process buffer inherits coding system of process output.
7408 Bind it to t if the process output is to be treated as if it were a file
7409 read from some filesystem.  */);
7410   inherit_process_coding_system = 0;
7411
7412   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7413                doc: /* Alist to decide a coding system to use for a file I/O operation.
7414 The format is ((PATTERN . VAL) ...),
7415 where PATTERN is a regular expression matching a file name,
7416 VAL is a coding system, a cons of coding systems, or a function symbol.
7417 If VAL is a coding system, it is used for both decoding and encoding
7418 the file contents.
7419 If VAL is a cons of coding systems, the car part is used for decoding,
7420 and the cdr part is used for encoding.
7421 If VAL is a function symbol, the function must return a coding system
7422 or a cons of coding systems which are used as above.  The function gets
7423 the arguments with which `find-operation-coding-system' was called.
7424
7425 See also the function `find-operation-coding-system'
7426 and the variable `auto-coding-alist'.  */);
7427   Vfile_coding_system_alist = Qnil;
7428
7429   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7430     doc: /* Alist to decide a coding system to use for a process I/O operation.
7431 The format is ((PATTERN . VAL) ...),
7432 where PATTERN is a regular expression matching a program name,
7433 VAL is a coding system, a cons of coding systems, or a function symbol.
7434 If VAL is a coding system, it is used for both decoding what received
7435 from the program and encoding what sent to the program.
7436 If VAL is a cons of coding systems, the car part is used for decoding,
7437 and the cdr part is used for encoding.
7438 If VAL is a function symbol, the function must return a coding system
7439 or a cons of coding systems which are used as above.
7440
7441 See also the function `find-operation-coding-system'.  */);
7442   Vprocess_coding_system_alist = Qnil;
7443
7444   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7445     doc: /* Alist to decide a coding system to use for a network I/O operation.
7446 The format is ((PATTERN . VAL) ...),
7447 where PATTERN is a regular expression matching a network service name
7448 or is a port number to connect to,
7449 VAL is a coding system, a cons of coding systems, or a function symbol.
7450 If VAL is a coding system, it is used for both decoding what received
7451 from the network stream and encoding what sent to the network stream.
7452 If VAL is a cons of coding systems, the car part is used for decoding,
7453 and the cdr part is used for encoding.
7454 If VAL is a function symbol, the function must return a coding system
7455 or a cons of coding systems which are used as above.
7456
7457 See also the function `find-operation-coding-system'.  */);
7458   Vnetwork_coding_system_alist = Qnil;
7459
7460   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7461                doc: /* Coding system to use with system messages.
7462 Also used for decoding keyboard input on X Window system.  */);
7463   Vlocale_coding_system = Qnil;
7464
7465   /* The eol mnemonics are reset in startup.el system-dependently.  */
7466   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7467                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7468   eol_mnemonic_unix = build_string (":");
7469
7470   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7471                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7472   eol_mnemonic_dos = build_string ("\\");
7473
7474   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7475                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7476   eol_mnemonic_mac = build_string ("/");
7477
7478   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7479                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7480   eol_mnemonic_undecided = build_string (":");
7481
7482   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7483                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7484   Venable_character_translation = Qt;
7485
7486   DEFVAR_LISP ("standard-translation-table-for-decode",
7487                &Vstandard_translation_table_for_decode,
7488                doc: /* Table for translating characters while decoding.  */);
7489   Vstandard_translation_table_for_decode = Qnil;
7490
7491   DEFVAR_LISP ("standard-translation-table-for-encode",
7492                &Vstandard_translation_table_for_encode,
7493                doc: /* Table for translating characters while encoding.  */);
7494   Vstandard_translation_table_for_encode = Qnil;
7495
7496   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7497                doc: /* Alist of charsets vs revision numbers.
7498 While encoding, if a charset (car part of an element) is found,
7499 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7500   Vcharset_revision_alist = Qnil;
7501
7502   DEFVAR_LISP ("default-process-coding-system",
7503                &Vdefault_process_coding_system,
7504                doc: /* Cons of coding systems used for process I/O by default.
7505 The car part is used for decoding a process output,
7506 the cdr part is used for encoding a text to be sent to a process.  */);
7507   Vdefault_process_coding_system = Qnil;
7508
7509   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7510                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7511 This is a vector of length 256.
7512 If Nth element is non-nil, the existence of code N in a file
7513 \(or output of subprocess) doesn't prevent it to be detected as
7514 a coding system of ISO 2022 variant which has a flag
7515 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7516 or reading output of a subprocess.
7517 Only 128th through 159th elements has a meaning.  */);
7518   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7519
7520   DEFVAR_LISP ("select-safe-coding-system-function",
7521                &Vselect_safe_coding_system_function,
7522                doc: /* Function to call to select safe coding system for encoding a text.
7523
7524 If set, this function is called to force a user to select a proper
7525 coding system which can encode the text in the case that a default
7526 coding system used in each operation can't encode the text.
7527
7528 The default value is `select-safe-coding-system' (which see).  */);
7529   Vselect_safe_coding_system_function = Qnil;
7530
7531   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
7532                doc: /* Char-table containing safe coding systems of each characters.
7533 Each element doesn't include such generic coding systems that can
7534 encode any characters.  They are in the first extra slot.  */);
7535   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
7536
7537   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7538                &inhibit_iso_escape_detection,
7539                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7540
7541 By default, on reading a file, Emacs tries to detect how the text is
7542 encoded.  This code detection is sensitive to escape sequences.  If
7543 the sequence is valid as ISO2022, the code is determined as one of
7544 the ISO2022 encodings, and the file is decoded by the corresponding
7545 coding system (e.g. `iso-2022-7bit').
7546
7547 However, there may be a case that you want to read escape sequences in
7548 a file as is.  In such a case, you can set this variable to non-nil.
7549 Then, as the code detection ignores any escape sequences, no file is
7550 detected as encoded in some ISO2022 encoding.  The result is that all
7551 escape sequences become visible in a buffer.
7552
7553 The default value is nil, and it is strongly recommended not to change
7554 it.  That is because many Emacs Lisp source files that contain
7555 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7556 in Emacs's distribution, and they won't be decoded correctly on
7557 reading if you suppress escape sequence detection.
7558
7559 The other way to read escape sequences in a file without decoding is
7560 to explicitly specify some coding system that doesn't use ISO2022's
7561 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
7562   inhibit_iso_escape_detection = 0;
7563 }
7564
7565 char *
7566 emacs_strerror (error_number)
7567      int error_number;
7568 {
7569   char *str;
7570
7571   synchronize_system_messages_locale ();
7572   str = strerror (error_number);
7573
7574   if (! NILP (Vlocale_coding_system))
7575     {
7576       Lisp_Object dec = code_convert_string_norecord (build_string (str),
7577                                                       Vlocale_coding_system,
7578                                                       0);
7579       str = (char *) SDATA (dec);
7580     }
7581
7582   return str;
7583 }
7584
7585 #endif /* emacs */
7586