src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_XXX (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
 170   DST_BYTES zero means that the source area and destination area are
 171   overlapped, which means that we can produce encoded text until it
 172   reaches the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
 188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 189    get one and two bytes from the source text respectively.  If there
 190    are not enough bytes in the source, they set coding->result to
 191    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
 192    without consuming anything.  The caller should set variables
 193    `coding', `src' and `src_end' in advance.  These macros are called
 194    from `decode_coding_XXX', thus the source text is assumed unibyte.  */
 195
 196 #define ONE_MORE_BYTE(c1)                                       \
 197   do {                                                          \
 198     if (src >= src_end)                                         \
 199       {                                                         \
 200         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 201         goto label_end_of_loop;                                 \
 202       }                                                         \
 203     c1 = *src++;                                                \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src + 1 >= src_end)                                     \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = *src++;                                                \
 214     c2 = *src++;                                                \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but if MULTIBYTEP is nonzero and the byte read is
 219    LEADING_CODE_8_BIT_CONTROL, set C1 to the byte after it minus 0x20.  */
 220
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 230       c1 = *src++ - 0x20; /* Not checked against src_end.  */   \
 231   } while (0)
 232
 233 /* Set C to the next character at the source text pointed by `src'
 234    and advance `src' past it.  If the source is exhausted, set
 235    coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
 236    `label_end_of_loop'.  The caller should set variables `coding',
 237    `src', `src_end', and `translation_table' in advance; C is mapped
 238    through `translation_table' unless that is nil.  This macro is
 239    used in encoding routines `encode_coding_XXX', thus it assumes
 240    multibyte source text, except that 8-bit characters are single
 241    bytes unless coding->src_multibyte is nonzero.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;                                      \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  If there's not
 264    enough space at `dst', set coding->result to
 265    CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
 266    When `dst_bytes' is zero, the limit is `src' rather than `dst_end'.
 267
 268    C is written to `dst' only if no composition is in progress or the
 269    method is COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE.  If composing
 270    by a method other than COMPOSITION_RELATIVE, C is added to
 271    coding->cmp_data->data.  This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298 /* Store byte C at `dst' if room, else finish with INSUFFICIENT_DST.  */
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308 /* Store C1 and C2 at `dst', or bail out if fewer than 2 bytes fit.  */
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {                                                          \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318 /* Copy bytes [FROM, TO) to `dst', advancing FROM; bail out if no room.  */
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {                                                          \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)                                           \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348
 349 #else  /* not emacs */
 350
 351 #include "mulelib.h"
 352
 353 #endif /* not emacs */
 354
 355 Lisp_Object Qcoding_system, Qeol_type;
 356 Lisp_Object Qbuffer_file_coding_system;
 357 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 358 Lisp_Object Qno_conversion, Qundecided;
 359 Lisp_Object Qcoding_system_history;
 360 Lisp_Object Qsafe_chars;
 361 Lisp_Object Qvalid_codes;
 362
 363 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 364 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 365 Lisp_Object Qstart_process, Qopen_network_stream;
 366 Lisp_Object Qtarget_idx;
 367
 368 Lisp_Object Vselect_safe_coding_system_function;
 369
 370 /* Mnemonic string for each format of end-of-line.  */
 371 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 372 /* Mnemonic string to indicate format of end-of-line is not yet
 373    decided.  */
 374 Lisp_Object eol_mnemonic_undecided;
 375
 376 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 377    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 378 int system_eol_type;
 379
 380 #ifdef emacs
 381
 382 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 383
 384 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 385
 386 /* Coding system emacs-mule and raw-text are for converting only
 387    end-of-line format.  */
 388 Lisp_Object Qemacs_mule, Qraw_text;
 389
 390 /* Coding-systems are handed between Emacs Lisp programs and C internal
 391    routines by the following three variables.  */
 392 /* Coding-system for reading files and receiving data from process.  */
 393 Lisp_Object Vcoding_system_for_read;
 394 /* Coding-system for writing files and sending data to process.  */
 395 Lisp_Object Vcoding_system_for_write;
 396 /* Coding-system actually used in the latest I/O.  */
 397 Lisp_Object Vlast_coding_system_used;
 398
 399 /* A vector of length 256 which contains information about special
 400    Latin codes (especially for dealing with Microsoft codes).  */
 401 Lisp_Object Vlatin_extra_code_table;
 402
 403 /* Flag to inhibit code conversion of end-of-line format.  */
 404 int inhibit_eol_conversion;
 405
 406 /* Flag to inhibit ISO2022 escape sequence detection.  */
 407 int inhibit_iso_escape_detection;
 408
 409 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 410 int inherit_process_coding_system;
 411
 412 /* Coding system to be used to encode text for terminal display.  */
 413 struct coding_system terminal_coding;
 414
 415 /* Coding system to be used to encode text for terminal display when
 416    terminal coding system is nil.  */
 417 struct coding_system safe_terminal_coding;
 418
 419 /* Coding system of what is sent from terminal keyboard.  */
 420 struct coding_system keyboard_coding;
 421
 422 /* Default coding system to be used to write a file.  */
 423 struct coding_system default_buffer_file_coding;
 424
 425 Lisp_Object Vfile_coding_system_alist;
 426 Lisp_Object Vprocess_coding_system_alist;
 427 Lisp_Object Vnetwork_coding_system_alist;
 428
 429 Lisp_Object Vlocale_coding_system;
 430
 431 #endif /* emacs */
 432
 433 Lisp_Object Qcoding_category, Qcoding_category_index;
 434
 435 /* List of symbols `coding-category-xxx' ordered by priority.  */
 436 Lisp_Object Vcoding_category_list;
 437
 438 /* Table of coding categories (Lisp symbols).  */
 439 Lisp_Object Vcoding_category_table;
 440
 441 /* Table of symbol names for each coding-category.  */
 442 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 443   "coding-category-emacs-mule",
 444   "coding-category-sjis",
 445   "coding-category-iso-7",
 446   "coding-category-iso-7-tight",
 447   "coding-category-iso-8-1",
 448   "coding-category-iso-8-2",
 449   "coding-category-iso-7-else",
 450   "coding-category-iso-8-else",
 451   "coding-category-ccl",
 452   "coding-category-big5",
 453   "coding-category-utf-8",
 454   "coding-category-utf-16-be",
 455   "coding-category-utf-16-le",
 456   "coding-category-raw-text",
 457   "coding-category-binary"
 458 };
 459
 460 /* Table of pointers to coding systems corresponding to each coding
 461    category.  */
 462 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 463
 464 /* Table of coding category masks.  Nth element is the mask for the
 465    coding category whose priority is Nth.  */
 466 static
 467 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 468
 469 /* Flag to tell if we look up translation table on character code
 470    conversion.  */
 471 Lisp_Object Venable_character_translation;
 472 /* Standard translation table to look up on decoding (reading).  */
 473 Lisp_Object Vstandard_translation_table_for_decode;
 474 /* Standard translation table to look up on encoding (writing).  */
 475 Lisp_Object Vstandard_translation_table_for_encode;
 476
 477 Lisp_Object Qtranslation_table;
 478 Lisp_Object Qtranslation_table_id;
 479 Lisp_Object Qtranslation_table_for_decode;
 480 Lisp_Object Qtranslation_table_for_encode;
 481
 482 /* Alist of charsets vs revision number.  */
 483 Lisp_Object Vcharset_revision_alist;
 484
 485 /* Default coding systems used for process I/O.  */
 486 Lisp_Object Vdefault_process_coding_system;
 487
 488 /* Global flag to tell that we can't call post-read-conversion and
 489    pre-write-conversion functions.  Usually the value is zero, but it
 490    is set to 1 temporarily while such functions are running.  This is
 491    to avoid infinite recursive call.  */
 492 static int inhibit_pre_post_conversion;
 493
 494 /* Char-table containing safe coding systems of each character.  */
 495 Lisp_Object Vchar_coding_system_table;
 496 Lisp_Object Qchar_coding_system;
 497
 498 /* Return the `safe-chars' property of coding system CODING if it is a
 499    char-table, else return Qt.  Don't check validity of CODING.  */
 500
 501 Lisp_Object
 502 coding_safe_chars (coding)
 503      struct coding_system *coding;
 504 {
 505   Lisp_Object coding_spec, plist, safe_chars;
 506
 507   coding_spec = Fget (coding->symbol, Qcoding_system);
 508   plist = XVECTOR (coding_spec)->contents[3];  /* Slot 3 is used as a plist.  */
 509   safe_chars = Fplist_get (plist, Qsafe_chars);
 510   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 511 }
 512 /* Nonzero if SAFE_CHARS is Qt or has a non-nil entry for character C.  */
 513 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 514   (EQ (safe_chars, Qt) ? 1 : !NILP (CHAR_TABLE_REF (safe_chars, c)))
 515
 516 \f
 517 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 518
 519 /* Emacs' internal format for representation of multiple character
 520    sets is a kind of multi-byte encoding, i.e. characters are
 521    represented by variable-length sequences of one-byte codes.
 522
 523    ASCII characters and control characters (e.g. `tab', `newline') are
 524    represented by one-byte sequences which are their ASCII codes, in
 525    the range 0x00 through 0x7F.
 526
 527    8-bit characters of the range 0x80..0x9F are represented by
 528    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 529    code + 0x20).
 530
 531    8-bit characters of the range 0xA0..0xFF are represented by
 532    one-byte sequences which are their 8-bit code.
 533
 534    The other characters are represented by a sequence of `base
 535    leading-code', optional `extended leading-code', and one or two
 536    `position-code's.  The length of the sequence is determined by the
 537    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 538    whereas extended leading-code and position-code take the range 0xA0
 539    through 0xFF.  See `charset.h' for more details about leading-code
 540    and position-code.
 541
 542    --- CODE RANGE of Emacs' internal format ---
 543    character set        range
 544    -------------        -----
 545    ascii                0x00..0x7F
 546    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 547    eight-bit-graphic    0xA0..0xFF
 548    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 549    ---------------------------------------------
 550
 551    As this is the internal character representation, the format is
 552    usually not used externally (i.e. in a file or in a data sent to a
 553    process).  But, it is possible to have a text externally in this
 554    format (i.e. by encoding by the coding system `emacs-mule').
 555
 556    In that case, a sequence of one-byte codes has a slightly different
 557    form.
 558
 559    Firstly, all characters in eight-bit-control are represented by
 560    one-byte sequences which are their 8-bit code.
 561
 562    Next, character composition data are represented by the byte
 563    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 564    where,
 565         METHOD is 0xF0 plus one of composition method (enum
 566         composition_method),
 567
 568         BYTES is 0xA0 plus the byte length of these composition data,
 569
 570         CHARS is 0xA0 plus the number of characters composed by these
 571         data,
 572
 573         COMPONENTs are characters of multibyte form or composition
 574         rules encoded by two-byte of ASCII codes.
 575
 576    In addition, for backward compatibility, the following formats are
 577    also recognized as composition data on decoding.
 578
 579    0x80 MSEQ ...
 580    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 581
 582    Here,
 583         MSEQ is a multibyte form but in this special format:
 584           ASCII: 0xA0 ASCII_CODE+0x80,
 585           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 586         RULE is a one byte code of the range 0xA0..0xF0 that
 587         represents a composition rule.
 588   */
 589
 590 enum emacs_code_class_type emacs_code_class[256];
 591
 592 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 593    Return 0 if the text contains ESC, SI, SO, or a code 0x81..0x9F not
 594    starting a valid multibyte form; else CODING_CATEGORY_MASK_EMACS_MULE.  */
 595
 596 static int
 597 detect_coding_emacs_mule (src, src_end, multibytep)
 598       unsigned char *src, *src_end;
 599       int multibytep;
 600 {
 601   unsigned char c;
 602   int composing = 0;
 603   /* Dummy target: ONE_MORE_BYTE_CHECK_MULTIBYTE stores to coding->result.  */
 604   struct coding_system dummy_coding;
 605   struct coding_system *coding = &dummy_coding;
 606
 607   while (1)
 608     {
 609       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);  /* Exits loop at end of text.  */
 610
 611       if (composing)
 612         {
 613           if (c < 0xA0)
 614             composing = 0;  /* Leave the composing state.  */
 615           else if (c == 0xA0)
 616             {
 617               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 618               c &= 0x7F;  /* ASCII component: 0xA0 ASCII_CODE+0x80.  */
 619             }
 620           else
 621             c -= 0x20;  /* Component leading code is stored as LEADING_CODE+0x20.  */
 622         }
 623
 624       if (c < 0x20)
 625         {
 626           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 627             return 0;
 628         }
 629       else if (c >= 0x80 && c < 0xA0)
 630         {
 631           if (c == 0x80)
 632             /* Old leading code for a composite character.  */
 633             composing = 1;
 634           else
 635             {
 636               unsigned char *src_base = src - 1;
 637               int bytes;
 638
 639               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 640                                                bytes))
 641                 return 0;
 642               src = src_base + bytes;  /* Skip the whole multibyte form.  */
 643             }
 644         }
 645     }
 646  label_end_of_loop:
 647   return CODING_CATEGORY_MASK_EMACS_MULE;
 648 }
 649
 650
 651 /* Begin a composition record storing START (plus char_offset) and METHOD.  */
 652
 653 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 654   do {                                                          \
 655     struct composition_data *cmp_data = coding->cmp_data;       \
 656     int *data = cmp_data->data + cmp_data->used;                \
 657     coding->cmp_data_start = cmp_data->used;                    \
 658     data[0] = -1; /* Replaced by CODING_ADD_COMPOSITION_END.  */ \
 659     data[1] = cmp_data->char_offset + start;                    \
 660     data[3] = (int) method;                                     \
 661     cmp_data->used += 4; /* data[2] is left for the END macro.  */ \
 662   } while (0)
 663
 664 /* Finish the current composition record: store its length and END.  */
 665
 666 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 667   do {                                                          \
 668     struct composition_data *cmp_data = coding->cmp_data;       \
 669     int *data = cmp_data->data + coding->cmp_data_start;        \
 670     data[0] = cmp_data->used - coding->cmp_data_start;          \
 671     data[2] = cmp_data->char_offset + end;                      \
 672   } while (0)
 673
 674 /* Append one COMPONENT (alternate character or composition rule).  */
 675 /* No check is made here that cmp_data->data has room.  */
 676 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
 677   (coding->cmp_data->data[coding->cmp_data->used++] = component)
 678
 679
 680 /* Get one byte from the data pointed to by SRC and increment SRC.  If
 681    SRC is not less than SRC_END, return -1 without incrementing SRC.  */
 682
 683 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 684
 685
 686 /* Decode a character represented as a component of composition
 687    sequence of Emacs 20 style at SRC.  Set C to that character, store
 688    its multibyte form sequence at P, and set P to the end of that
 689    sequence.  If no valid character is found, set C to -1.  */
 690
 691 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 692   do {                                                          \
 693     int bytes;                                                  \
 694                                                                 \
 695     c = SAFE_ONE_MORE_BYTE ();                                  \
 696     if (c < 0)                                                  \
 697       break;                                                    \
 698     if (CHAR_HEAD_P (c))                                        \
 699       c = -1;                                                   \
 700     else if (c == 0xA0)                                         \
 701       {                                                         \
 702         c = SAFE_ONE_MORE_BYTE ();                              \
 703         if (c < 0xA0)                                           \
 704           c = -1;                                               \
 705         else                                                    \
 706           {                                                     \
 707             c -= 0x80; /* ASCII is 0xA0 ASCII_CODE+0x80.  */    \
 708             *p++ = c;                                           \
 709           }                                                     \
 710       }                                                         \
 711     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 712       {                                                         \
 713         unsigned char *p0 = p;                                  \
 714                                                                 \
 715         c -= 0x20;                                              \
 716         *p++ = c;                                               \
 717         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 718         while (--bytes)                                         \
 719           {                                                     \
 720             c = SAFE_ONE_MORE_BYTE ();                          \
 721             if (c < 0)                                          \
 722               break;                                            \
 723             *p++ = c;                                           \
 724           }                                                     \
 725         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))     \
 726           c = STRING_CHAR (p0, bytes);                          \
 727         else                                                    \
 728           c = -1;                                               \
 729       }                                                         \
 730     else                                                        \
 731       c = -1;                                                   \
 732   } while (0)
 733
 734
 735 /* Decode a composition rule represented as a component of composition
 736    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 737    valid rule is found, set C to -1.
     
        The rule byte is read with SAFE_ONE_MORE_BYTE.  After 0xA0 is
        subtracted it must lie in the range 0..80; that value is split
        into GREF (value / 9) and NREF (value % 9), which are combined by
        COMPOSITION_ENCODE_RULE.  GREF and NREF are not declared by this
        macro, so they must be variables in scope where it is expanded.  */
 738
 739 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 740   do {                                                  \
 741     c = SAFE_ONE_MORE_BYTE ();                          \
 742     c -= 0xA0;                                          \
 743     if (c < 0 || c >= 81)                               \
 744       c = -1;                                           \
 745     else                                                \
 746       {                                                 \
 747         gref = c / 9, nref = c % 9;                     \
 748         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 749       }                                                 \
 750   } while (0)
 751
 752
 753 /* Decode composition sequence encoded by `emacs-mule' at the source
 754    pointed by SRC.  SRC_END is the end of source.  Store information
 755    of the composition in CODING->cmp_data.
 756
 757    For backward compatibility, decode also a composition sequence of
 758    Emacs 20 style.  In that case, the composition sequence contains
 759    characters that should be extracted into a buffer or string.  Store
 760    those characters at *DESTINATION in multibyte form.
 761
 762    If we encounter an invalid byte sequence, return 0.
 763    If we encounter an insufficient source or destination, or
 764    insufficient space in CODING->cmp_data, return -1.
 765    Otherwise, return consumed bytes in the source.
 766
        DST_END is the end of the destination area.  If DST_BYTES is zero,
        the space check for the extracted characters is made against SRC
        instead of DST_END.  *DESTINATION is advanced past any characters
        stored there, and CODING->produced_char is increased by the number
        of characters stored.
     
 767 */
 768 static INLINE int
 769 decode_composition_emacs_mule (coding, src, src_end,
 770                                destination, dst_end, dst_bytes)
 771      struct coding_system *coding;
 772      unsigned char *src, *src_end, **destination, *dst_end;
 773      int dst_bytes;
 774 {
 775   unsigned char *dst = *destination;
 776   int method, data_len, nchars;
 777   unsigned char *src_base = src++;
 778   /* Store components of composition.  */
 779   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 780   int ncomponent;
 781   /* Store multibyte form of characters to be composed.  This is for
 782      Emacs 20 style composition sequence.  */
 783   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 784   unsigned char *bufp = buf;
 785   int c, i, gref, nref;
 786
       /* Make sure one more bunch of composition data fits before
          decoding anything.  */
 787   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 788       >= COMPOSITION_DATA_SIZE)
 789     {
 790       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 791       return -1;
 792     }
 793
 794   ONE_MORE_BYTE (c);
 795   if (c - 0xF0 >= COMPOSITION_RELATIVE
 796            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 797     {
           /* Current format: 0x80 METHOD LENGTH NCHARS COMPONENT ...,
              where METHOD is biased by 0xF0 and LENGTH and NCHARS are
              biased by 0xA0.  */
 798       int with_rule;
 799
 800       method = c - 0xF0;
 801       with_rule = (method == COMPOSITION_WITH_RULE
 802                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 803       ONE_MORE_BYTE (c);
 804       data_len = c - 0xA0;
 805       if (data_len < 4
 806           || src_base + data_len > src_end)
 807         return 0;
 808       ONE_MORE_BYTE (c);
 809       nchars = c - 0xA0;
           /* Check the decoded count, not the raw byte: any byte below
              0xA1 yields a count that cannot describe a composition.  */
 810       if (nchars < 1)
 811         return 0;
 812       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 813         {
               /* COMPONENT holds only COMPOSITION_DATA_MAX_BUNCH_LENGTH
                  entries.  A sequence with more components than that
                  can't be valid, and storing it would overrun the
                  array.  */
               if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
                 return 0;
 814           if (ncomponent % 2 && with_rule)
 815             {
                   /* Odd components of a rule-based method are rules,
                      stored as two bytes biased by 32.  */
 816               ONE_MORE_BYTE (gref);
 817               gref -= 32;
 818               ONE_MORE_BYTE (nref);
 819               nref -= 32;
 820               c = COMPOSITION_ENCODE_RULE (gref, nref);
 821             }
 822           else
 823             {
 824               int bytes;
 825               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 826                 c = STRING_CHAR (src, bytes);
 827               else
 828                 c = *src, bytes = 1;
 829               src += bytes;
 830             }
 831           component[ncomponent] = c;
 832         }
 833     }
 834   else
 835     {
 836       /* This may be an old Emacs 20 style format.  See the comment at
 837          the section 2 of this file.  */
 838       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
           /* If no following character head was found and more input may
              still arrive, we can't tell where the sequence ends yet.  */
 839       if (src == src_end
 840           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 841         goto label_end_of_loop;
 842
           /* Restrict decoding to the bytes of this sequence and restart
              just after the leading 0x80.  */
 843       src_end = src;
 844       src = src_base + 1;
 845       if (c < 0xC0)
 846         {
 847           method = COMPOSITION_RELATIVE;
 848           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 849             {
 850               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 851               if (c < 0)
 852                 break;
 853               component[ncomponent++] = c;
 854             }
 855           if (ncomponent < 2)
 856             return 0;
 857           nchars = ncomponent;
 858         }
 859       else if (c == 0xFF)
 860         {
 861           method = COMPOSITION_WITH_RULE;
               /* Skip the 0xFF marker.  */
 862           src++;
 863           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 864           if (c < 0)
 865             return 0;
 866           component[0] = c;
 867           for (ncomponent = 1;
 868                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 869             {
 870               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 871               if (c < 0)
 872                 break;
 873               component[ncomponent++] = c;
 874               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 875               if (c < 0)
 876                 break;
 877               component[ncomponent++] = c;
 878             }
 879           if (ncomponent < 3)
 880             return 0;
               /* Components alternate CHAR RULE CHAR ..., so count only
                  the characters.  */
 881           nchars = (ncomponent + 1) / 2;
 882         }
 883       else
 884         return 0;
 885     }
 886
       /* Record the composition only if the characters extracted into
          BUF (if any) fit in the destination.  */
 887   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 888     {
 889       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 890       for (i = 0; i < ncomponent; i++)
 891         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 892       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 893       if (buf < bufp)
 894         {
 895           unsigned char *p = buf;
 896           EMIT_BYTES (p, bufp);
 897           *destination += bufp - buf;
 898           coding->produced_char += nchars;
 899         }
 900       return (src - src_base);
 901     }
       /* NOTE(review): ONE_MORE_BYTE is defined outside this function;
          presumably it also jumps here when the source runs out.  */
 902  label_end_of_loop:
 903   return -1;
 904 }
 905
 906 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
     
        Decode SRC_BYTES bytes at SOURCE into DESTINATION.  End-of-line
        bytes are converted according to CODING->eol_type, a 0x80 byte is
        handed to decode_composition_emacs_mule, a valid multibyte
        sequence is copied as is, and any other byte is converted with
        CHAR_STRING.  If DST_BYTES is zero, the destination space check is
        made against SRC instead of DESTINATION + DST_BYTES.
     
        On return, CODING->consumed and CODING->consumed_char are both
        set to the number of source bytes up to the start of the last
        unfinished iteration, CODING->produced to the number of bytes
        stored, and CODING->produced_char to the number of characters
        produced by this function (compositions add their own count).  */
 907
 908 static void
 909 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 910      struct coding_system *coding;
 911      unsigned char *source, *destination;
 912      int src_bytes, dst_bytes;
 913 {
 914   unsigned char *src = source;
 915   unsigned char *src_end = source + src_bytes;
 916   unsigned char *dst = destination;
 917   unsigned char *dst_end = destination + dst_bytes;
 918   /* SRC_BASE remembers the start position in source in each loop.
 919      The loop will be exited when there's not enough source code, or
 920      when there's not enough destination area to produce a
 921      character.  */
 922   unsigned char *src_base;
 923
 924   coding->produced_char = 0;
 925   while ((src_base = src) < src_end)
 926     {
           /* TMP receives the multibyte form made by CHAR_STRING; P
              points at the BYTES bytes to copy to the destination.  */
 927       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 928       int bytes;
 929
 930       if (*src == '\r')
 931         {
 932           int c = *src++;
 933
 934           if (coding->eol_type == CODING_EOL_CR)
 935             c = '\n';
 936           else if (coding->eol_type == CODING_EOL_CRLF)
 937             {
                   /* NOTE(review): ONE_MORE_BYTE is defined elsewhere;
                      presumably it jumps to label_end_of_loop when no
                      byte follows the CR.  */
 938               ONE_MORE_BYTE (c);
 939               if (c != '\n')
 940                 {
 941                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 942                     {
 943                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
 944                       goto label_end_of_loop;
 945                     }
                       /* A lone CR: put back the byte just read and
                          produce the CR itself.  */
 946                   src--;
 947                   c = '\r';
 948                 }
 949             }
 950           *dst++ = c;
 951           coding->produced_char++;
 952           continue;
 953         }
 954       else if (*src == '\n')
 955         {
               /* A bare LF does not match a CR or CRLF end-of-line type;
                  stop here if inconsistent EOLs are inhibited.  */
 956           if ((coding->eol_type == CODING_EOL_CR
 957                || coding->eol_type == CODING_EOL_CRLF)
 958               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 959             {
 960               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 961               goto label_end_of_loop;
 962             }
 963           *dst++ = *src++;
 964           coding->produced_char++;
 965           continue;
 966         }
 967       else if (*src == 0x80)
 968         {
 969           /* Start of composition data.  */
 970           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
 971                                                          &dst, dst_end,
 972                                                          dst_bytes);
               /* Negative: insufficient source, destination or
                  composition data space.  Positive: that many source
                  bytes were handled.  Zero: not a valid composition, so
                  the 0x80 byte is converted like any other stray byte.  */
 973           if (consumed < 0)
 974             goto label_end_of_loop;
 975           else if (consumed > 0)
 976             {
 977               src += consumed;
 978               continue;
 979             }
 980           bytes = CHAR_STRING (*src, tmp);
 981           p = tmp;
 982           src++;
 983         }
 984       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 985         {
 986           p = src;
 987           src += bytes;
 988         }
 989       else
 990         {
 991           bytes = CHAR_STRING (*src, tmp);
 992           p = tmp;
 993           src++;
 994         }
 995       if (dst + bytes >= (dst_bytes ? dst_end : src))
 996         {
 997           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 998           break;
 999         }
1000       while (bytes--) *dst++ = *p++;
1001       coding->produced_char++;
1002     }
1003  label_end_of_loop:
       /* SRC_BASE, not SRC, is used so that a character that could not
          be finished is not counted as consumed.  */
1004   coding->consumed = coding->consumed_char = src_base - source;
1005   coding->produced = dst - destination;
1006 }
1007
1008
1009 /* Encode composition data stored at DATA into a special byte sequence
1010    starting by 0x80.  Update CODING->cmp_data_start and maybe
1011    CODING->cmp_data for the next call.
     
        The sequence built is:
          0x80
          0xF0 + DATA[3]               (the composition method)
          0xA0 + length of the sequence in bytes
          0xA0 + (DATA[2] - DATA[1])   (the number of composed chars)
        followed by the components DATA[4] .. DATA[DATA[0] - 1].
        Characters are written with CHAR_STRING.  For the two rule-based
        methods, every second component is a rule, written as the two
        bytes 0x20 + GREF and 0x20 + NREF obtained from
        COMPOSITION_DECODE_RULE.
     
        This macro uses the variables DST, DST_END, DST_BYTES and SRC and
        the label `label_end_of_loop' of the function it is expanded in.
        If the sequence does not fit in the destination, it sets
        CODING->result to CODING_FINISH_INSUFFICIENT_DST and jumps to that
        label without storing anything.
     
        NOTE(review): the space check adds 4 to (p - buf), although
        p - buf already includes the 4 header bytes; this looks merely
        conservative -- confirm.  */
1012
1013 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1014   do {                                                                  \
1015     unsigned char buf[1024], *p0 = buf, *p;                             \
1016     int len = data[0];                                                  \
1017     int i;                                                              \
1018                                                                         \
1019     buf[0] = 0x80;                                                      \
1020     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1021     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1022     p = buf + 4;                                                        \
1023     if (data[3] == COMPOSITION_WITH_RULE                                \
1024         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1025       {                                                                 \
1026         p += CHAR_STRING (data[4], p);                                  \
1027         for (i = 5; i < len; i += 2)                                    \
1028           {                                                             \
1029             int gref, nref;                                             \
1030              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1031             *p++ = 0x20 + gref;                                         \
1032             *p++ = 0x20 + nref;                                         \
1033             p += CHAR_STRING (data[i + 1], p);                          \
1034           }                                                             \
1035       }                                                                 \
1036     else                                                                \
1037       {                                                                 \
1038         for (i = 4; i < len; i++)                                       \
1039           p += CHAR_STRING (data[i], p);                                \
1040       }                                                                 \
1041     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1042                                                                         \
1043     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1044       {                                                                 \
1045         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1046         goto label_end_of_loop;                                         \
1047       }                                                                 \
1048     while (p0 < p)                                                      \
1049       *dst++ = *p0++;                                                   \
1050     coding->cmp_data_start += data[0];                                  \
1051     if (coding->cmp_data_start == coding->cmp_data->used                \
1052         && coding->cmp_data->next)                                      \
1053       {                                                                 \
1054         coding->cmp_data = coding->cmp_data->next;                      \
1055         coding->cmp_data_start = 0;                                     \
1056       }                                                                 \
1057   } while (0)
1058
1059
1060 static void encode_eol P_ ((struct coding_system *, unsigned char *,
1061                             unsigned char *, int, int));
1062
     /* Encode SRC_BYTES bytes of text at SOURCE into DESTINATION in the
        `emacs-mule' format.
     
        If CODING has no composition data, the whole job is delegated to
        encode_eol.  Otherwise the text is processed character by
        character.  Before a character at which the next composition
        entry starts, the composition is written out with
        ENCODE_COMPOSITION_EMACS_MULE.  A newline is written as CR LF or
        CR when CODING->eol_type is CODING_EOL_CRLF or CODING_EOL_CR.
        Every other single-byte character is written as is, and every
        other character is copied byte for byte from the source.
     
        On return, CODING->consumed is the number of source bytes up to
        the start of the last unfinished iteration, and CODING->produced
        and CODING->produced_char are both set to the number of bytes
        stored.  CODING->consumed_char is incremented once per character
        handled.  */
     
1063 static void
1064 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1065      struct coding_system *coding;
1066      unsigned char *source, *destination;
1067      int src_bytes, dst_bytes;
1068 {
1069   unsigned char *src = source;
1070   unsigned char *src_end = source + src_bytes;
1071   unsigned char *dst = destination;
1072   unsigned char *dst_end = destination + dst_bytes;
1073   unsigned char *src_base;
1074   int c;
1075   int char_offset;
1076   int *data;
1077
       /* NOTE(review): TRANSLATION_TABLE is only assigned in the code
          visible here; presumably ONE_MORE_CHAR refers to it -- confirm
          against the macro definition.  */
1078   Lisp_Object translation_table;
1079
1080   translation_table = Qnil;
1081
1082   /* Optimization for the case that there's no composition.  */
1083   if (!coding->cmp_data || coding->cmp_data->used == 0)
1084     {
1085       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1086       return;
1087     }
1088
1089   char_offset = coding->cmp_data->char_offset;
1090   data = coding->cmp_data->data + coding->cmp_data_start;
       /* NOTE(review): no exit from this loop is written out here;
          presumably ONE_MORE_CHAR and the EMIT_* macros jump to
          label_end_of_loop when the source or the destination is
          exhausted -- confirm against their definitions.  */
1091   while (1)
1092     {
1093       src_base = src;
1094
1095       /* If SRC starts a composition, encode the information about the
1096          composition in advance.  */
1097       if (coding->cmp_data_start < coding->cmp_data->used
1098           && char_offset + coding->consumed_char == data[1])
1099         {
1100           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have advanced CODING->cmp_data_start or
                  switched to the next CODING->cmp_data, so reload.  */
1101           char_offset = coding->cmp_data->char_offset;
1102           data = coding->cmp_data->data + coding->cmp_data_start;
1103         }
1104
1105       ONE_MORE_CHAR (c);
1106       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1107                         || coding->eol_type == CODING_EOL_CR))
1108         {
1109           if (coding->eol_type == CODING_EOL_CRLF)
1110             EMIT_TWO_BYTES ('\r', c);
1111           else
1112             EMIT_ONE_BYTE ('\r');
1113         }
1114       else if (SINGLE_BYTE_CHAR_P (c))
1115         EMIT_ONE_BYTE (c);
1116       else
             /* Copy the character's source bytes unchanged.  */
1117         EMIT_BYTES (src_base, src);
1118       coding->consumed_char++;
1119     }
1120  label_end_of_loop:
1121   coding->consumed = src_base - source;
1122   coding->produced = coding->produced_char = dst - destination;
1123   return;
1124 }
1125
1126 \f
1127 /*** 3. ISO2022 handlers ***/
1128
1129 /* The following note describes the coding system ISO2022 briefly.
1130    Since the intention of this note is to help understand the
1131    functions in this file, some parts are NOT ACCURATE or are OVERLY
1132    SIMPLIFIED.  For thorough understanding, please refer to the
1133    original document of ISO2022.  This is equivalent to the standard
1134    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1135
1136    ISO2022 provides many mechanisms to encode several character sets
1137    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1138    is encoded using bytes less than 128.  This may make the encoded
1139    text a little bit longer, but the text passes more easily through
1140    several types of gateway, some of which strip off the MSB (Most
1141    Significant Bit).
1142
1143    There are two kinds of character sets: control character sets and
1144    graphic character sets.  The former contain control characters such
1145    as `newline' and `escape' to provide control functions (control
1146    functions are also provided by escape sequences).  The latter
1147    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1148    two control character sets and many graphic character sets.
1149
1150    Graphic character sets are classified into one of the following
1151    four classes, according to the number of bytes (DIMENSION) and
1152    number of characters in one dimension (CHARS) of the set:
1153    - DIMENSION1_CHARS94
1154    - DIMENSION1_CHARS96
1155    - DIMENSION2_CHARS94
1156    - DIMENSION2_CHARS96
1157
1158    In addition, each character set is assigned an identification tag,
1159    unique for each set, called the "final character" (denoted as <F>
1160    hereafter).  The <F> of each character set is decided by ECMA(*)
1161    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1162    (0x30..0x3F are for private use only).
1163
1164    Note (*): ECMA = European Computer Manufacturers Association
1165
1166    Here are examples of graphic character sets [NAME(<F>)]:
1167         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1168         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1169         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1170         o DIMENSION2_CHARS96 -- none for the moment
1171
1172    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1173         C0 [0x00..0x1F] -- control character plane 0
1174         GL [0x20..0x7F] -- graphic character plane 0
1175         C1 [0x80..0x9F] -- control character plane 1
1176         GR [0xA0..0xFF] -- graphic character plane 1
1177
1178    A control character set is directly designated and invoked to C0 or
1179    C1 by an escape sequence.  The most common case is that:
1180    - ISO646's  control character set is designated/invoked to C0, and
1181    - ISO6429's control character set is designated/invoked to C1,
1182    and usually these designations/invocations are omitted in encoded
1183    text.  In a 7-bit environment, only C0 can be used, and a control
1184    character for C1 is encoded by an appropriate escape sequence to
1185    fit into the environment.  All control characters for C1 are
1186    defined to have corresponding escape sequences.
1187
1188    A graphic character set is at first designated to one of four
1189    graphic registers (G0 through G3), then these graphic registers are
1190    invoked to GL or GR.  These designations and invocations can be
1191    done independently.  The most common case is that G0 is invoked to
1192    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1193    these invocations and designations are omitted in encoded text.
1194    In a 7-bit environment, only GL can be used.
1195
1196    When a graphic character set of CHARS94 is invoked to GL, codes
1197    0x20 and 0x7F of the GL area work as control characters SPACE and
1198    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1199    be used.
1200
1201    There are two ways of invocation: locking-shift and single-shift.
1202    With locking-shift, the invocation lasts until the next different
1203    invocation, whereas with single-shift, the invocation affects the
1204    following character only and doesn't affect the locking-shift
1205    state.  Invocations are done by the following control characters or
1206    escape sequences:
1207
1208    ----------------------------------------------------------------------
1209    abbrev  function                  cntrl escape seq   description
1210    ----------------------------------------------------------------------
1211    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1212    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1213    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1214    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1215    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1216    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1217    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
1218    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1219    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1220    ----------------------------------------------------------------------
1221    (*) These are not used by any known coding system.
1222
1223    Control characters for these functions are defined by macros
1224    ISO_CODE_XXX in `coding.h'.
1225
1226    Designations are done by the following escape sequences:
1227    ----------------------------------------------------------------------
1228    escape sequence      description
1229    ----------------------------------------------------------------------
1230    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1231    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1232    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1233    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1234    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1235    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1236    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1237    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1238    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1239    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1240    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1241    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1242    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1243    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1244    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1245    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1246    ----------------------------------------------------------------------
1247
1248    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1249    of dimension 1, chars 94, and final character <F>, etc...
1250
1251    Note (*): Although these designations are not allowed in ISO2022,
1252    Emacs accepts them on decoding, and produces them on encoding
1253    CHARS96 character sets in a coding system which is characterized as
1254    7-bit environment, non-locking-shift, and non-single-shift.
1255
1256    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1257    '(' can be omitted.  We refer to this as "short-form" hereafter.
1258
1259    Now you may notice that there are a lot of ways of encoding the
1260    same multilingual text in ISO2022.  Actually, there exist many
1261    coding systems such as Compound Text (used in X11's inter client
1262    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1263    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1264    localized platforms), and all of these are variants of ISO2022.
1265
1266    In addition to the above, Emacs handles two more kinds of escape
1267    sequences: ISO6429's direction specification and Emacs' private
1268    sequence for specifying character composition.
1269
1270    ISO6429's direction specification takes the following form:
1271         o CSI ']'      -- end of the current direction
1272         o CSI '0' ']'  -- end of the current direction
1273         o CSI '1' ']'  -- start of left-to-right text
1274         o CSI '2' ']'  -- start of right-to-left text
1275    The control character CSI (0x9B: control sequence introducer) is
1276    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1277
1278    Character composition specification takes the following form:
1279         o ESC '0' -- start relative composition
1280         o ESC '1' -- end composition
1281         o ESC '2' -- start rule-base composition (*)
1282         o ESC '3' -- start relative composition with alternate chars  (**)
1283         o ESC '4' -- start rule-base composition with alternate chars  (**)
1284   Since these are not standard escape sequences of any ISO standard,
1285   the use of them with these meanings is restricted to Emacs only.
1286
1287   (*) This form is used only in Emacs 20.5 and older versions,
1288   but the newer versions can safely decode it.
1289   (**) This form is used only in Emacs 21.1 and newer versions,
1290   and the older versions can't decode it.
1291
1292   Here's a list of example usages of these composition escape
1293   sequences (categorized by `enum composition_method').
1294
1295   COMPOSITION_RELATIVE:
1296         ESC 0 CHAR [ CHAR ] ESC 1
1297   COMPOSITION_WITH_RULE:
1298         ESC 2 CHAR [ RULE CHAR ] ESC 1
1299   COMPOSITION_WITH_ALTCHARS:
1300         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1301   COMPOSITION_WITH_RULE_ALTCHARS:
1302         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1303
1304 enum iso_code_class_type iso_code_class[256];  /* 256 entries;
        presumably one class per byte value -- confirm where it is
        filled in.  */
1305
     /* Non-zero if the coding system at coding_system_table[IDX] exists
        and can handle CHARSET: either CHARSET is CHARSET_ASCII or the
        character C is among the safe characters of that coding system,
        and the coding system's requested designation for CHARSET is not
        CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.
     
        As a side effect this assigns to `safe_chars', which is not
        declared by the macro and so must be a variable in scope where it
        is expanded.  The assignment is skipped when CHARSET is
        CHARSET_ASCII.  */
1306 #define CHARSET_OK(idx, charset, c)                                     \
1307   (coding_system_table[idx]                                             \
1308    && (charset == CHARSET_ASCII                                         \
1309        || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
1310            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1311    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1312                                               charset)                  \
1313        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1314
     /* Non-zero if the coding system at coding_system_table[IDX] has an
        initial designation (a value >= 0) for graphic register 1.
        Unlike CHARSET_OK, this does not check that the table entry is
        non-null.  */
1315 #define SHIFT_OUT_OK(idx) \
1316   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1317
1318 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1319    Check if a text is encoded in ISO2022.  If it is, return an
1320    integer in which appropriate flag bits any of:
1321         CODING_CATEGORY_MASK_ISO_7
1322         CODING_CATEGORY_MASK_ISO_7_TIGHT
1323         CODING_CATEGORY_MASK_ISO_8_1
1324         CODING_CATEGORY_MASK_ISO_8_2
1325         CODING_CATEGORY_MASK_ISO_7_ELSE
1326         CODING_CATEGORY_MASK_ISO_8_ELSE
1327    are set.  If a code which should never appear in ISO2022 is found,
1328    returns 0.  */
1329
1330 static int
1331 detect_coding_iso2022 (src, src_end, multibytep)
1332      unsigned char *src, *src_end;
1333      int multibytep;
1334 {
1335   int mask = CODING_CATEGORY_MASK_ISO;
1336   int mask_found = 0;
1337   int reg[4], shift_out = 0, single_shifting = 0;
1338   int c, c1, charset;
1339   /* Dummy for ONE_MORE_BYTE.  */
1340   struct coding_system dummy_coding;
1341   struct coding_system *coding = &dummy_coding;
1342   Lisp_Object safe_chars;
1343
1344   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1345   while (mask && src < src_end)
1346     {
1347       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1348       switch (c)
1349         {
1350         case ISO_CODE_ESC:
1351           if (inhibit_iso_escape_detection)
1352             break;
1353           single_shifting = 0;
1354           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1355           if (c >= '(' && c <= '/')
1356             {
1357               /* Designation sequence for a charset of dimension 1.  */
1358               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1359               if (c1 < ' ' || c1 >= 0x80
1360                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1361                 /* Invalid designation sequence.  Just ignore.  */
1362                 break;
1363               reg[(c - '(') % 4] = charset;
1364             }
1365           else if (c == '$')
1366             {
1367               /* Designation sequence for a charset of dimension 2.  */
1368               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1369               if (c >= '@' && c <= 'B')
1370                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1371                 reg[0] = charset = iso_charset_table[1][0][c];
1372               else if (c >= '(' && c <= '/')
1373                 {
1374                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1375                   if (c1 < ' ' || c1 >= 0x80
1376                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1377                     /* Invalid designation sequence.  Just ignore.  */
1378                     break;
1379                   reg[(c - '(') % 4] = charset;
1380                 }
1381               else
1382                 /* Invalid designation sequence.  Just ignore.  */
1383                 break;
1384             }
1385           else if (c == 'N' || c == 'O')
1386             {
1387               /* ESC <Fe> for SS2 or SS3.  */
1388               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1389               break;
1390             }
1391           else if (c >= '0' && c <= '4')
1392             {
1393               /* ESC <Fp> for start/end composition.  */
1394               mask_found |= CODING_CATEGORY_MASK_ISO;
1395               break;
1396             }
1397           else
1398             /* Invalid escape sequence.  Just ignore.  */
1399             break;
1400
1401           /* We found a valid designation sequence for CHARSET.  */
1402           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1403           c = MAKE_CHAR (charset, 0, 0);
1404           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1405             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1406           else
1407             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1408           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1409             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1410           else
1411             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1412           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1413             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1414           else
1415             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1416           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1417             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1418           else
1419             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1420           break;
1421
1422         case ISO_CODE_SO:
1423           if (inhibit_iso_escape_detection)
1424             break;
1425           single_shifting = 0;
1426           if (shift_out == 0
1427               && (reg[1] >= 0
1428                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1429                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1430             {
1431               /* Locking shift out.  */
1432               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1433               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1434             }
1435           break;
1436
1437         case ISO_CODE_SI:
1438           if (inhibit_iso_escape_detection)
1439             break;
1440           single_shifting = 0;
1441           if (shift_out == 1)
1442             {
1443               /* Locking shift in.  */
1444               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1445               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1446             }
1447           break;
1448
1449         case ISO_CODE_CSI:
1450           single_shifting = 0;
1451         case ISO_CODE_SS2:
1452         case ISO_CODE_SS3:
1453           {
1454             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1455
1456             if (inhibit_iso_escape_detection)
1457               break;
1458             if (c != ISO_CODE_CSI)
1459               {
1460                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1461                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1462                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1463                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1464                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1465                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1466                 single_shifting = 1;
1467               }
1468             if (VECTORP (Vlatin_extra_code_table)
1469                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1470               {
1471                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1472                     & CODING_FLAG_ISO_LATIN_EXTRA)
1473                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1474                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1475                     & CODING_FLAG_ISO_LATIN_EXTRA)
1476                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1477               }
1478             mask &= newmask;
1479             mask_found |= newmask;
1480           }
1481           break;
1482
1483         default:
1484           if (c < 0x80)
1485             {
1486               single_shifting = 0;
1487               break;
1488             }
1489           else if (c < 0xA0)
1490             {
1491               single_shifting = 0;
1492               if (VECTORP (Vlatin_extra_code_table)
1493                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1494                 {
1495                   int newmask = 0;
1496
1497                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1498                       & CODING_FLAG_ISO_LATIN_EXTRA)
1499                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1500                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1501                       & CODING_FLAG_ISO_LATIN_EXTRA)
1502                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1503                   mask &= newmask;
1504                   mask_found |= newmask;
1505                 }
1506               else
1507                 return 0;
1508             }
1509           else
1510             {
1511               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1512                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1513               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1514               /* Check the length of succeeding codes of the range
1515                  0xA0..0FF.  If the byte length is odd, we exclude
1516                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1517                  when we are not single shifting.  */
1518               if (!single_shifting
1519                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1520                 {
1521                   int i = 1;
1522                   while (src < src_end)
1523                     {
1524                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1525                       if (c < 0xA0)
1526                         break;
1527                       i++;
1528                     }
1529
1530                   if (i & 1 && src < src_end)
1531                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1532                   else
1533                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1534                 }
1535             }
1536           break;
1537         }
1538     }
1539  label_end_of_loop:
1540   return (mask & mask_found);
1541 }
1542
/* Produce the character code for the character of CHARSET whose 1st
   position code is C1 and whose 2nd position code is C2.  The variable
   `translation_table' is taken from the expansion site: when it is
   non-nil the code comes from translate_char, otherwise from
   MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (! NILP (translation_table)                                   \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1552
/* Set designation state into CODING: designate to graphic register REG
   the charset that ISO_CHARSET_TABLE yields for DIMENSION, CHARS and
   FINAL_CHAR.

   Control jumps to `label_invalid_code' at the expansion site when
   FINAL_CHAR is outside '0'..127.  It also jumps there, after storing
   REG in last_invalid_designation_register, when the charset is
   negative or when CODING does not request REG for it and
   CODING_SAFE_CHAR_P rejects it for `safe_chars'.  The variables
   `coding' and `safe_chars' come from the expansion site.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int designated, probe, g0_was_rejected;                                \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    designated = ISO_CHARSET_TABLE (make_number (dimension),               \
                                    make_number (chars),                   \
                                    make_number (final_char));             \
    probe = MAKE_CHAR (designated, 0, 0);                                  \
    if (designated < 0                                                     \
        || ((CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, designated)    \
             != (reg))                                                     \
            && ! (CODING_SAFE_CHAR_P (safe_chars, probe))))                \
      {                                                                    \
        /* Unacceptable designation; remember which register it was      \
           aimed at.  */                                                   \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
    g0_was_rejected                                                        \
      = coding->spec.iso2022.last_invalid_designation_register == 0;       \
    coding->spec.iso2022.last_invalid_designation_register = -1;           \
    if (g0_was_rejected && (reg) == 0 && designated == CHARSET_ASCII)      \
      /* The previous invalid designation was to register 0 and this     \
         one puts ASCII back there.  Insert this designation sequence    \
         as is so that it is surely written back to a file.  */           \
      goto label_invalid_code;                                             \
    if ((coding->mode & CODING_MODE_DIRECTION)                             \
        && CHARSET_REVERSE_CHARSET (designated) >= 0)                      \
      designated = CHARSET_REVERSE_CHARSET (designated);                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = designated;                \
  } while (0)
1589
1590 /* Allocate a memory block for storing information about compositions.
1591    The block is chained to the already allocated blocks.  */
1592
1593 void
1594 coding_allocate_composition_data (coding, char_offset)
1595      struct coding_system *coding;
1596      int char_offset;
1597 {
1598   struct composition_data *cmp_data
1599     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1600
1601   cmp_data->char_offset = char_offset;
1602   cmp_data->used = 0;
1603   cmp_data->prev = coding->cmp_data;
1604   cmp_data->next = NULL;
1605   if (coding->cmp_data)
1606     coding->cmp_data->next = cmp_data;
1607   coding->cmp_data = cmp_data;
1608   coding->cmp_data_start = 0;
1609 }
1610
/* Process a composition start sequence; C1 is the byte after ESC.
     ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
     ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
     ESC 3 : altchar composition  : ESC 3 ALT ... ESC 0 CHAR ... ESC 1
     ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
   The variables `coding' and `dst' and the label `label_end_of_loop'
   are taken from the expansion site.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition is disabled; copy the two bytes through.  */        \
        dst[0] = ISO_CODE_ESC;                                             \
        dst[1] = c1 & 0x7f;                                                \
        dst += 2;                                                          \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* A new composition begins here.  coding->cmp_data must have     \
           room for its description.  If it has not, stop the current     \
           decoding loop so that the caller can allocate one more         \
           block and restart; allocating here is not possible because     \
           it may cause buffer/string relocation.  */                     \
        if (coding->cmp_data == NULL                                       \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        switch (c1)                                                        \
          {                                                                \
          case '0':                                                        \
            coding->composing = COMPOSITION_RELATIVE;                      \
            break;                                                         \
          case '2':                                                        \
            coding->composing = COMPOSITION_WITH_RULE;                     \
            break;                                                         \
          case '3':                                                        \
            coding->composing = COMPOSITION_WITH_ALTCHARS;                 \
            break;                                                         \
          default:                                                         \
            coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;            \
            break;                                                         \
          }                                                                \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else if (coding->composing == COMPOSITION_WITH_ALTCHARS                \
             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)       \
      {                                                                    \
        /* A composition with alternate characters is already being       \
           handled; the codes after this escape sequence are the          \
           actual characters stored in a buffer.  */                      \
        coding->composing = COMPOSITION_RELATIVE;                          \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
  } while (0)
1663
/* Process the composition end sequence ESC 1.  While a composition is
   being handled, record its end and leave composing state; otherwise
   copy ESC and C1 to DST as two produced characters.  The variables
   `coding' and `dst' are taken from the expansion site.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1680
/* Turn the byte C1 (plus, for the new format, one more byte read from
   SRC into the variable `c2' of the expansion site) into one encoded
   composition rule and append it to coding->cmp_data.  C1 must be an
   lvalue: 32 is subtracted from it in place.  A value that matches
   neither format is recorded as rule 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule_code = 0;                                                  \
                                                                        \
    (c1) -= 32;                                                         \
    if ((c1) >= 81)                                                     \
      {                                                                 \
        if ((c1) < 93)          /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule_code = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);     \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int global_ref = (c1) / 9;                                      \
        int new_ref = (c1) % 9;                                         \
                                                                        \
        if (global_ref == 4)                                            \
          global_ref = 10;                                              \
        if (new_ref == 4)                                               \
          new_ref = 10;                                                 \
        rule_code = COMPOSITION_ENCODE_RULE (global_ref, new_ref);      \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule_code);               \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1705
1706
1707 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1708
1709 static void
1710 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1711      struct coding_system *coding;
1712      unsigned char *source, *destination;
1713      int src_bytes, dst_bytes;
1714 {
1715   unsigned char *src = source;
1716   unsigned char *src_end = source + src_bytes;
1717   unsigned char *dst = destination;
1718   unsigned char *dst_end = destination + dst_bytes;
1719   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1720   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1721   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1722   /* SRC_BASE remembers the start position in source in each loop.
1723      The loop will be exited when there's not enough source code
1724      (within macro ONE_MORE_BYTE), or when there's not enough
1725      destination area to produce a character (within macro
1726      EMIT_CHAR).  */
1727   unsigned char *src_base;
1728   int c, charset;
1729   Lisp_Object translation_table;
1730   Lisp_Object safe_chars;
1731
1732   safe_chars = coding_safe_chars (coding);
1733
1734   if (NILP (Venable_character_translation))
1735     translation_table = Qnil;
1736   else
1737     {
1738       translation_table = coding->translation_table_for_decode;
1739       if (NILP (translation_table))
1740         translation_table = Vstandard_translation_table_for_decode;
1741     }
1742
1743   coding->result = CODING_FINISH_NORMAL;
1744
1745   while (1)
1746     {
1747       int c1, c2;
1748
1749       src_base = src;
1750       ONE_MORE_BYTE (c1);
1751
1752       /* We produce no character or one character.  */
1753       switch (iso_code_class [c1])
1754         {
1755         case ISO_0x20_or_0x7F:
1756           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1757             {
1758               DECODE_COMPOSITION_RULE (c1);
1759               continue;
1760             }
1761           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1762             {
1763               /* This is SPACE or DEL.  */
1764               charset = CHARSET_ASCII;
1765               break;
1766             }
1767           /* This is a graphic character, we fall down ...  */
1768
1769         case ISO_graphic_plane_0:
1770           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1771             {
1772               DECODE_COMPOSITION_RULE (c1);
1773               continue;
1774             }
1775           charset = charset0;
1776           break;
1777
1778         case ISO_0xA0_or_0xFF:
1779           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1780               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1781             goto label_invalid_code;
1782           /* This is a graphic character, we fall down ... */
1783
1784         case ISO_graphic_plane_1:
1785           if (charset1 < 0)
1786             goto label_invalid_code;
1787           charset = charset1;
1788           break;
1789
1790         case ISO_control_0:
1791           if (COMPOSING_P (coding))
1792             DECODE_COMPOSITION_END ('1');
1793
1794           /* All ISO2022 control characters in this class have the
1795              same representation in Emacs internal format.  */
1796           if (c1 == '\n'
1797               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1798               && (coding->eol_type == CODING_EOL_CR
1799                   || coding->eol_type == CODING_EOL_CRLF))
1800             {
1801               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1802               goto label_end_of_loop;
1803             }
1804           charset = CHARSET_ASCII;
1805           break;
1806
1807         case ISO_control_1:
1808           if (COMPOSING_P (coding))
1809             DECODE_COMPOSITION_END ('1');
1810           goto label_invalid_code;
1811
1812         case ISO_carriage_return:
1813           if (COMPOSING_P (coding))
1814             DECODE_COMPOSITION_END ('1');
1815
1816           if (coding->eol_type == CODING_EOL_CR)
1817             c1 = '\n';
1818           else if (coding->eol_type == CODING_EOL_CRLF)
1819             {
1820               ONE_MORE_BYTE (c1);
1821               if (c1 != ISO_CODE_LF)
1822                 {
1823                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1824                     {
1825                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1826                       goto label_end_of_loop;
1827                     }
1828                   src--;
1829                   c1 = '\r';
1830                 }
1831             }
1832           charset = CHARSET_ASCII;
1833           break;
1834
1835         case ISO_shift_out:
1836           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1837               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1838             goto label_invalid_code;
1839           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1840           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1841           continue;
1842
1843         case ISO_shift_in:
1844           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1845             goto label_invalid_code;
1846           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1847           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1848           continue;
1849
1850         case ISO_single_shift_2_7:
1851         case ISO_single_shift_2:
1852           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1853             goto label_invalid_code;
1854           /* SS2 is handled as an escape sequence of ESC 'N' */
1855           c1 = 'N';
1856           goto label_escape_sequence;
1857
1858         case ISO_single_shift_3:
1859           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1860             goto label_invalid_code;
1861           /* SS2 is handled as an escape sequence of ESC 'O' */
1862           c1 = 'O';
1863           goto label_escape_sequence;
1864
1865         case ISO_control_sequence_introducer:
1866           /* CSI is handled as an escape sequence of ESC '[' ...  */
1867           c1 = '[';
1868           goto label_escape_sequence;
1869
1870         case ISO_escape:
1871           ONE_MORE_BYTE (c1);
1872         label_escape_sequence:
1873           /* Escape sequences handled by Emacs are invocation,
1874              designation, direction specification, and character
1875              composition specification.  */
1876           switch (c1)
1877             {
1878             case '&':           /* revision of following character set */
1879               ONE_MORE_BYTE (c1);
1880               if (!(c1 >= '@' && c1 <= '~'))
1881                 goto label_invalid_code;
1882               ONE_MORE_BYTE (c1);
1883               if (c1 != ISO_CODE_ESC)
1884                 goto label_invalid_code;
1885               ONE_MORE_BYTE (c1);
1886               goto label_escape_sequence;
1887
1888             case '$':           /* designation of 2-byte character set */
1889               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1890                 goto label_invalid_code;
1891               ONE_MORE_BYTE (c1);
1892               if (c1 >= '@' && c1 <= 'B')
1893                 {       /* designation of JISX0208.1978, GB2312.1980,
1894                            or JISX0208.1980 */
1895                   DECODE_DESIGNATION (0, 2, 94, c1);
1896                 }
1897               else if (c1 >= 0x28 && c1 <= 0x2B)
1898                 {       /* designation of DIMENSION2_CHARS94 character set */
1899                   ONE_MORE_BYTE (c2);
1900                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1901                 }
1902               else if (c1 >= 0x2C && c1 <= 0x2F)
1903                 {       /* designation of DIMENSION2_CHARS96 character set */
1904                   ONE_MORE_BYTE (c2);
1905                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1906                 }
1907               else
1908                 goto label_invalid_code;
1909               /* We must update these variables now.  */
1910               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1911               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1912               continue;
1913
1914             case 'n':           /* invocation of locking-shift-2 */
1915               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1916                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1917                 goto label_invalid_code;
1918               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1919               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1920               continue;
1921
1922             case 'o':           /* invocation of locking-shift-3 */
1923               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1924                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1925                 goto label_invalid_code;
1926               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1927               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1928               continue;
1929
1930             case 'N':           /* invocation of single-shift-2 */
1931               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1932                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1933                 goto label_invalid_code;
1934               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1935               ONE_MORE_BYTE (c1);
1936               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1937                 goto label_invalid_code;
1938               break;
1939
1940             case 'O':           /* invocation of single-shift-3 */
1941               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1942                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1943                 goto label_invalid_code;
1944               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1945               ONE_MORE_BYTE (c1);
1946               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1947                 goto label_invalid_code;
1948               break;
1949
1950             case '0': case '2': case '3': case '4': /* start composition */
1951               DECODE_COMPOSITION_START (c1);
1952               continue;
1953
1954             case '1':           /* end composition */
1955               DECODE_COMPOSITION_END (c1);
1956               continue;
1957
1958             case '[':           /* specification of direction */
1959               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1960                 goto label_invalid_code;
1961               /* For the moment, nested direction is not supported.
1962                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1963                  left-to-right, and nonzero means right-to-left.  */
1964               ONE_MORE_BYTE (c1);
1965               switch (c1)
1966                 {
1967                 case ']':       /* end of the current direction */
1968                   coding->mode &= ~CODING_MODE_DIRECTION;
1969
1970                 case '0':       /* end of the current direction */
1971                 case '1':       /* start of left-to-right direction */
1972                   ONE_MORE_BYTE (c1);
1973                   if (c1 == ']')
1974                     coding->mode &= ~CODING_MODE_DIRECTION;
1975                   else
1976                     goto label_invalid_code;
1977                   break;
1978
1979                 case '2':       /* start of right-to-left direction */
1980                   ONE_MORE_BYTE (c1);
1981                   if (c1 == ']')
1982                     coding->mode |= CODING_MODE_DIRECTION;
1983                   else
1984                     goto label_invalid_code;
1985                   break;
1986
1987                 default:
1988                   goto label_invalid_code;
1989                 }
1990               continue;
1991
1992             default:
1993               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1994                 goto label_invalid_code;
1995               if (c1 >= 0x28 && c1 <= 0x2B)
1996                 {       /* designation of DIMENSION1_CHARS94 character set */
1997                   ONE_MORE_BYTE (c2);
1998                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1999                 }
2000               else if (c1 >= 0x2C && c1 <= 0x2F)
2001                 {       /* designation of DIMENSION1_CHARS96 character set */
2002                   ONE_MORE_BYTE (c2);
2003                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2004                 }
2005               else
2006                 goto label_invalid_code;
2007               /* We must update these variables now.  */
2008               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2009               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2010               continue;
2011             }
2012         }
2013
2014       /* Now we know CHARSET and 1st position code C1 of a character.
2015          Produce a multibyte sequence for that character while getting
2016          2nd position code C2 if necessary.  */
2017       if (CHARSET_DIMENSION (charset) == 2)
2018         {
2019           ONE_MORE_BYTE (c2);
2020           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2021             /* C2 is not in a valid range.  */
2022             goto label_invalid_code;
2023         }
2024       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2025       EMIT_CHAR (c);
2026       continue;
2027
2028     label_invalid_code:
2029       coding->errors++;
2030       if (COMPOSING_P (coding))
2031         DECODE_COMPOSITION_END ('1');
2032       src = src_base;
2033       c = *src++;
2034       EMIT_CHAR (c);
2035     }
2036
2037  label_end_of_loop:
2038   coding->consumed = coding->consumed_char = src_base - source;
2039   coding->produced = dst - destination;
2040   return;
2041 }
2042
2043
2044 /* ISO2022 encoding stuff.  */
2045
2046 /*
2047    It is not enough to say just "ISO2022" on encoding, we have to
2048    specify more details.  In Emacs, each ISO2022 coding system
2049    variant has the following specifications:
2050         1. Initial designation to G0 through G3.
2051         2. Allows short-form designation?
2052         3. ASCII should be designated to G0 before control characters?
2053         4. ASCII should be designated to G0 at end of line?
2054         5. 7-bit environment or 8-bit environment?
2055         6. Use locking-shift?
2056         7. Use Single-shift?
2057    And the following two are only for Japanese:
2058         8. Use ASCII in place of JIS0201-1976-Roman?
2059         9. Use JISX0208-1983 in place of JISX0208-1978?
2060    These specifications are encoded in `coding->flags' as flag bits
2061    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2062    details.
2063 */
2064
2065 /* Produce codes (escape sequence) for designating CHARSET to graphic
2066    register REG at DST, and increment DST.  If <final-char> of CHARSET is
2067    '@', 'A', or 'B' and the coding system CODING allows, produce
2068    designation sequence of short-form.

        The bytes stored are, in order:
          - ESC '&' ('@' + revision), only when the revision number that
            CODING_SPEC_ISO_REVISION_NUMBER yields for CHARSET is below
            255;
          - ESC;
          - '$', when CHARSET_DIMENSION (CHARSET) is not 1;
          - one intermediate byte selected by REG: the REG-th of the four
            characters ( ) * + for a 94-character set, the REG-th of
            , - . / otherwise.  When the dimension is not 1 and the set
            has 94 characters, this byte is left out if
            CODING_FLAG_ISO_SHORT_FORM is set, REG is 0 and the final
            character lies in '@'..'B';
          - the final character of CHARSET.
        Afterwards CHARSET is recorded as the current designation of REG.

        The macro writes through the variable `dst' of the expansion
        site and performs no bounds check of its own.  REG indexes
        4-character strings, so it must be in the range 0..3.  */
2069
2070 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2071   do {                                                                  \
2072     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2073     char *intermediate_char_94 = "()*+";                                \
2074     char *intermediate_char_96 = ",-./";                                \
2075     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2076                                                                         \
2077     if (revision < 255)                                                 \
2078       {                                                                 \
2079         *dst++ = ISO_CODE_ESC;                                          \
2080         *dst++ = '&';                                                   \
2081         *dst++ = '@' + revision;                                        \
2082       }                                                                 \
2083     *dst++ = ISO_CODE_ESC;                                              \
2084     if (CHARSET_DIMENSION (charset) == 1)                               \
2085       {                                                                 \
2086         if (CHARSET_CHARS (charset) == 94)                              \
2087           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2088         else                                                            \
2089           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2090       }                                                                 \
2091     else                                                                \
2092       {                                                                 \
2093         *dst++ = '$';                                                   \
2094         if (CHARSET_CHARS (charset) == 94)                              \
2095           {                                                             \
2096             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
2097                 || reg != 0                                             \
2098                 || final_char < '@' || final_char > 'B')                \
2099               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2100           }                                                             \
2101         else                                                            \
2102           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2103       }                                                                 \
2104     *dst++ = final_char;                                                \
2105     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
2106   } while (0)
2107
/* Single-shift functions SS2 and SS3.  Each of the two macros below
   stores the function at DST -- as the two bytes ESC 'N' (SS2) or
   ESC 'O' (SS3) when CODING_FLAG_ISO_SEVEN_BITS is set in
   coding->flags, as the single control byte ISO_CODE_SS2 or
   ISO_CODE_SS3 otherwise -- and then sets the single-shifting state
   of CODING to 1.  Both use the variables `coding' and `dst' of the
   expansion site.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
2129
/* Locking-shift functions.  Each of the four macros below stores one
   function at DST and records which graphic register is now invoked
   to graphic plane 0 of CODING:

     ENCODE_SHIFT_IN          ISO_CODE_SI    register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO    register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'        register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'        register 3

   All of them use the variables `coding' and `dst' of the expansion
   site.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'n';                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'o';                               \
  } while (0)
2157
2158 /* Produce codes for a DIMENSION1 character whose character set is
2159    CHARSET and whose position-code is C1.  Designation and invocation
2160    sequences are also produced in advance if necessary.

        The body is a `do ... while (1)' loop that is left by `break':
          - when a single shift is pending, C1 is stored with its high
            bit cleared under CODING_FLAG_ISO_SEVEN_BITS and set
            otherwise, and the pending state is cleared;
          - when CHARSET is the charset of graphic plane 0, C1 is
            stored with its high bit cleared;
          - when CHARSET is the charset of graphic plane 1, C1 is
            stored with its high bit set;
          - otherwise encode_invocation_designation is called and the
            tests are repeated.

        The macro uses the variables `coding' and `dst' of the
        expansion site.  C1 is not parenthesized in the expansion, so
        it should be a plain variable.  */
2161
2162 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2163   do {                                                                  \
2164     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2165       {                                                                 \
2166         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2167           *dst++ = c1 & 0x7F;                                           \
2168         else                                                            \
2169           *dst++ = c1 | 0x80;                                           \
2170         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2171         break;                                                          \
2172       }                                                                 \
2173     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2174       {                                                                 \
2175         *dst++ = c1 & 0x7F;                                             \
2176         break;                                                          \
2177       }                                                                 \
2178     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2179       {                                                                 \
2180         *dst++ = c1 | 0x80;                                             \
2181         break;                                                          \
2182       }                                                                 \
2183     else                                                                \
2184       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2185          must invoke it, or, at first, designate it to some graphic     \
2186          register.  Then repeat the loop to actually produce the        \
2187          character.  */                                                 \
2188       dst = encode_invocation_designation (charset, coding, dst);       \
2189   } while (1)
2190
2191 /* Produce codes for a DIMENSION2 character whose character set is
2192    CHARSET and whose position-codes are C1 and C2.  Designation and
2193    invocation codes are also produced in advance if necessary.

        The body is a `do ... while (1)' loop that is left by `break':
          - when a single shift is pending, C1 and C2 are stored with
            their high bits cleared under CODING_FLAG_ISO_SEVEN_BITS
            and set otherwise, and the pending state is cleared;
          - when CHARSET is the charset of graphic plane 0, both bytes
            are stored with their high bits cleared;
          - when CHARSET is the charset of graphic plane 1, both bytes
            are stored with their high bits set;
          - otherwise encode_invocation_designation is called and the
            tests are repeated.

        The macro uses the variables `coding' and `dst' of the
        expansion site.  C1 and C2 are not parenthesized in the
        expansion, so they should be plain variables.  */
2194
2195 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2196   do {                                                                  \
2197     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2198       {                                                                 \
2199         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2200           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
2201         else                                                            \
2202           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
2203         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2204         break;                                                          \
2205       }                                                                 \
2206     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2207       {                                                                 \
2208         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
2209         break;                                                          \
2210       }                                                                 \
2211     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2212       {                                                                 \
2213         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
2214         break;                                                          \
2215       }                                                                 \
2216     else                                                                \
2217       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2218          must invoke it, or, at first, designate it to some graphic     \
2219          register.  Then repeat the loop to actually produce the        \
2220          character.  */                                                 \
2221       dst = encode_invocation_designation (charset, coding, dst);       \
2222   } while (1)
2223
/* Produce codes for the character C at DST.

   C is split by SPLIT_CHAR into a charset and the position-codes C1
   and C2.  When the charset is defined:
     - for a charset of dimension 1, ASCII is replaced by
       charset_latin_jisx0201 if CODING_FLAG_ISO_USE_ROMAN is set, and
       the character is produced by ENCODE_ISO_CHARACTER_DIMENSION1;
     - otherwise charset_jisx0208 is replaced by charset_jisx0208_1978
       if CODING_FLAG_ISO_USE_OLDJIS is set, and the character is
       produced by ENCODE_ISO_CHARACTER_DIMENSION2.
   When the charset is not defined, C1 is stored as is, followed by C2
   if C2 is not negative.

   The macro uses the variables `coding' and `dst' of the expansion
   site and declares its own `charset', `c1' and `c2'.  */
2224 #define ENCODE_ISO_CHARACTER(c)                                 \
2225   do {                                                          \
2226     int charset, c1, c2;                                        \
2227                                                                 \
2228     SPLIT_CHAR (c, charset, c1, c2);                            \
2229     if (CHARSET_DEFINED_P (charset))                            \
2230       {                                                         \
2231         if (CHARSET_DIMENSION (charset) == 1)                   \
2232           {                                                     \
2233             if (charset == CHARSET_ASCII                        \
2234                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
2235               charset = charset_latin_jisx0201;                 \
2236             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
2237           }                                                     \
2238         else                                                    \
2239           {                                                     \
2240             if (charset == charset_jisx0208                     \
2241                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
2242               charset = charset_jisx0208_1978;                  \
2243             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
2244           }                                                     \
2245       }                                                         \
2246     else                                                        \
2247       {                                                         \
2248         *dst++ = c1;                                            \
2249         if (c2 >= 0)                                            \
2250           *dst++ = c2;                                          \
2251       }                                                         \
2252   } while (0)
2253
2254
2255 /* Instead of encoding character C, produce one or two `?'s.

        What is actually encoded is the character
        CODING_INHIBIT_CHARACTER_SUBSTITUTION (presumably `?'; its
        definition is not in this part of the file).  It is encoded
        once, and a second time when CHARSET_WIDTH of the charset of C
        is greater than 1 -- presumably so that the replacement takes
        as many columns as C would have.  */
2256
2257 #define ENCODE_UNSAFE_CHARACTER(c)                                      \
2258   do {                                                                  \
2259     ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);       \
2260     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                           \
2261       ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
2262   } while (0)
2263
2264
2265 /* Produce designation and invocation codes at a place pointed by DST
2266    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2267    Return new DST.  */
2268
2269 unsigned char *
2270 encode_invocation_designation (charset, coding, dst)
2271      int charset;
2272      struct coding_system *coding;
2273      unsigned char *dst;
2274 {
2275   int reg;                      /* graphic register number */
2276
2277   /* At first, check designations.  */
2278   for (reg = 0; reg < 4; reg++)
2279     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2280       break;
2281
2282   if (reg >= 4)
2283     {
2284       /* CHARSET is not yet designated to any graphic registers.  */
2285       /* At first check the requested designation.  */
2286       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2287       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2288         /* Since CHARSET requests no special designation, designate it
2289            to graphic register 0.  */
2290         reg = 0;
2291
2292       ENCODE_DESIGNATION (charset, reg, coding);
2293     }
2294
2295   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2296       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2297     {
2298       /* Since the graphic register REG is not invoked to any graphic
2299          planes, invoke it to graphic plane 0.  */
2300       switch (reg)
2301         {
2302         case 0:                 /* graphic register 0 */
2303           ENCODE_SHIFT_IN;
2304           break;
2305
2306         case 1:                 /* graphic register 1 */
2307           ENCODE_SHIFT_OUT;
2308           break;
2309
2310         case 2:                 /* graphic register 2 */
2311           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2312             ENCODE_SINGLE_SHIFT_2;
2313           else
2314             ENCODE_LOCKING_SHIFT_2;
2315           break;
2316
2317         case 3:                 /* graphic register 3 */
2318           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2319             ENCODE_SINGLE_SHIFT_3;
2320           else
2321             ENCODE_LOCKING_SHIFT_3;
2322           break;
2323         }
2324     }
2325
2326   return dst;
2327 }
2328
/* Store the two bytes that represent the encoded composition rule
   RULE at DST and advance DST past them.  RULE is decomposed by
   COMPOSITION_DECODE_RULE into GREF and NREF; the bytes are
   32 + 81 + GREF and 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)           \
  do {                                          \
    int gref, nref;                             \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    dst[0] = 32 + 81 + gref;                    \
    dst[1] = 32 + nref;                         \
    dst += 2;                                   \
  } while (0)
2338
/* Begin a composition sequence at DST.  DATA points to the integers
   describing the composition (see coding.h for their layout); DATA[3]
   is taken as the composition method and stored in
   CODING->composing.

   For COMPOSITION_RELATIVE the bytes ESC '0' are produced.  For any
   other method ESC '3' (COMPOSITION_WITH_ALTCHARS) or ESC '4' is
   produced, CODING->cmp_data_index is set to the fifth integer of the
   current composition record and CODING->composition_rule_follows is
   cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2358
/* Finish the current composition: produce ESC '1' at DST, mark CODING
   as not composing, and step CODING->cmp_data_start over the record
   just handled (DATA[0] integers).  If that reaches the used length
   of the current composition-data block and another block follows,
   switch to the next block and restart at its beginning.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = '1';                                               \
    if (coding->cmp_data_start == coding->cmp_data->used)       \
      {                                                         \
        if (coding->cmp_data->next)                             \
          {                                                     \
            coding->cmp_data = coding->cmp_data->next;          \
            coding->cmp_data_start = 0;                         \
          }                                                     \
      }                                                         \
  } while (0)
2374
/* Produce ESC '0' at DST and switch CODING to COMPOSITION_RELATIVE.
   This does not open a new composition: it marks the point where the
   components (alternate chars and composition rules) of the current
   composition have all been produced and the actual text, still in
   SRC, begins.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
2386
/* The following three macros produce codes for indicating direction
   of text.  All of them use the variables `coding' and `dst' of the
   expansion site and are complete statements (use them as
   `ENCODE_DIRECTION_R2L;').  */

/* Produce a control sequence introducer at DST: the two bytes ESC '['
   when CODING_FLAG_ISO_SEVEN_BITS is set in coding->flags, the single
   byte ISO_CODE_CSI otherwise.  The flag is tested with `&', like in
   the single-shift macros, because coding->flags normally carries
   other flag bits as well.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Produce the control sequence introducer followed by '2' ']'.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

/* Produce the control sequence introducer followed by '0' ']'.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2402
/* Bring the graphic planes and registers of CODING back to their
   initial state, producing the necessary codes at DST: a shift-in
   when graphic plane 0 does not have register 0 invoked, then a
   designation sequence for every register whose initial designation
   is not negative and differs from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg = 0;                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0)     \
          {                                                             \
            if (CODING_SPEC_ISO_DESIGNATION (coding, reg)               \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))   \
              ENCODE_DESIGNATION                                        \
                (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),     \
                 reg, coding);                                          \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
2417
2418 /* Produce designation sequences of charsets in the line started from
2419    SRC to a place pointed by DST, and return updated DST.
2420
2421    If the current block ends before any end-of-line, we may fail to
2422    find all the necessary designations.

        Characters are fetched with ONE_MORE_CHAR until a newline is
        seen or four registers have been claimed.  For each character,
        the register its charset requests is looked up; the first
        charset found for a register is remembered.  Afterwards a
        designation sequence is produced for every remembered register
        whose current designation differs.

        ONE_MORE_CHAR is defined outside this part of the file;
        presumably it jumps to `label_end_of_loop' when the source is
        exhausted and is what consults TRANSLATION_TABLE and SRC_END,
        neither of which is referenced directly here -- confirm against
        the macro.  SRC is a local copy, so the caller's source pointer
        is not advanced by the scan.  */
2423
2424 static unsigned char *
2425 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2426      struct coding_system *coding;
2427      Lisp_Object translation_table;
2428      unsigned char *src, *src_end, *dst;
2429 {
2430   int charset, c, found = 0, reg;
2431   /* Table of charsets to be designated to each graphic register.  */
2432   int r[4];
2433
2434   for (reg = 0; reg < 4; reg++)
2435     r[reg] = -1;
2436
2437   while (found < 4)
2438     {
2439       ONE_MORE_CHAR (c);
2440       if (c == '\n')
2441         break;
2442
2443       charset = CHAR_CHARSET (c);
2444       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
           /* Only the first charset requesting a given register counts.  */
2445       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2446         {
2447           found++;
2448           r[reg] = charset;
2449         }
2450     }
2451
2452  label_end_of_loop:
2453   if (found)
2454     {
           /* Skip registers that already hold the wanted charset.  */
2455       for (reg = 0; reg < 4; reg++)
2456         if (r[reg] >= 0
2457             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2458           ENCODE_DESIGNATION (r[reg], reg, coding);
2459     }
2460
2461   return dst;
2462 }
2463
2464 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the SRC_BYTES bytes at SOURCE into DESTINATION according
        to the ISO2022 variant described by CODING.

        Each iteration of the main loop handles one unit: a pending
        composition start, end or component, or one source character.
        Control characters may reset the designation and invocation
        state depending on the CODING_FLAG_ISO_RESET_AT_* flags;
        newlines are produced according to CODING->eol_type.  A
        non-ASCII single-byte character is copied as is and counted in
        CODING->errors.  When CODING_FLAG_ISO_SAFE is set, a character
        that is not in the safe-character table is replaced by
        ENCODE_UNSAFE_CHARACTER.

        On return CODING->consumed is the number of source bytes fully
        handled (up to the start of the last, unfinished iteration),
        CODING->consumed_char counts the source characters encoded, and
        CODING->produced and CODING->produced_char are both the number
        of bytes stored.  CODING->result is set to
        CODING_FINISH_INSUFFICIENT_DST when the loop stops for lack of
        destination space.  */
2465
2466 static void
2467 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2468      struct coding_system *coding;
2469      unsigned char *source, *destination;
2470      int src_bytes, dst_bytes;
2471 {
2472   unsigned char *src = source;
2473   unsigned char *src_end = source + src_bytes;
2474   unsigned char *dst = destination;
2475   unsigned char *dst_end = destination + dst_bytes;
2476   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2477      from DST_END to assure overflow checking is necessary only at the
2478      head of loop.  */
2479   unsigned char *adjusted_dst_end = dst_end - 19;
2480   /* SRC_BASE remembers the start position in source in each loop.
2481      The loop will be exited when there's not enough source text to
2482      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2483      there's not enough destination area to produce encoded codes
2484      (within macro EMIT_BYTES).  */
2485   unsigned char *src_base;
2486   int c;
2487   Lisp_Object translation_table;
2488   Lisp_Object safe_chars;
2489
2490   safe_chars = coding_safe_chars (coding);
2491
       /* Pick the translation table: none when character translation is
          disabled, else the one of CODING, else the standard one.  */
2492   if (NILP (Venable_character_translation))
2493     translation_table = Qnil;
2494   else
2495     {
2496       translation_table = coding->translation_table_for_encode;
2497       if (NILP (translation_table))
2498         translation_table = Vstandard_translation_table_for_encode;
2499     }
2500
2501   coding->consumed_char = 0;
2502   coding->errors = 0;
2503   while (1)
2504     {
2505       src_base = src;
2506
           /* When DST_BYTES is zero the limit is taken relative to SRC
              instead of DST_END -- presumably the case of converting in
              place, with the output trailing the input; confirm against
              the callers.  */
2507       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2508         {
2509           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2510           break;
2511         }
2512
2513       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2514           && CODING_SPEC_ISO_BOL (coding))
2515         {
2516           /* We have to produce designation sequences if any now.  */
2517           dst = encode_designation_at_bol (coding, translation_table,
2518                                            src, src_end, dst);
2519           CODING_SPEC_ISO_BOL (coding) = 0;
2520         }
2521
2522       /* Check composition start and end.  */
2523       if (coding->composing != COMPOSITION_DISABLED
2524           && coding->cmp_data_start < coding->cmp_data->used)
2525         {
2526           struct composition_data *cmp_data = coding->cmp_data;
2527           int *data = cmp_data->data + coding->cmp_data_start;
2528           int this_pos = cmp_data->char_offset + coding->consumed_char;
2529
2530           if (coding->composing == COMPOSITION_RELATIVE)
2531             {
2532               if (this_pos == data[2])
2533                 {
2534                   ENCODE_COMPOSITION_END (coding, data);
                       /* The macro may have moved to the next data
                          block; refresh the local pointers.  */
2535                   cmp_data = coding->cmp_data;
2536                   data = cmp_data->data + coding->cmp_data_start;
2537                 }
2538             }
2539           else if (COMPOSING_P (coding))
2540             {
2541               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2542               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2543                 /* We have consumed components of the composition.
2544                    What follows in SRC is the composition's base
2545                    text.  */
2546                 ENCODE_COMPOSITION_FAKE_START (coding);
2547               else
2548                 {
                       /* NOTE(review): this C shadows the function-level
                          C for the rest of this block.  */
2549                   int c = cmp_data->data[coding->cmp_data_index++];
2550                   if (coding->composition_rule_follows)
2551                     {
2552                       ENCODE_COMPOSITION_RULE (c);
2553                       coding->composition_rule_follows = 0;
2554                     }
2555                   else
2556                     {
2557                       if (coding->flags & CODING_FLAG_ISO_SAFE
2558                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2559                         ENCODE_UNSAFE_CHARACTER (c);
2560                       else
2561                         ENCODE_ISO_CHARACTER (c);
2562                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2563                         coding->composition_rule_follows = 1;
2564                     }
                       /* A component was produced from the composition
                          data, not from SRC; start the next iteration
                          without counting a source character.  */
2565                   continue;
2566                 }
2567             }
2568           if (!COMPOSING_P (coding))
2569             {
2570               if (this_pos == data[1])
2571                 {
2572                   ENCODE_COMPOSITION_START (coding, data);
2573                   continue;
2574                 }
2575             }
2576         }
2577
2578       ONE_MORE_CHAR (c);
2579
2580       /* Now encode the character C.  */
2581       if (c < 0x20 || c == 0x7F)
2582         {
2583           if (c == '\r')
2584             {
2585               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2586                 {
2587                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2588                     ENCODE_RESET_PLANE_AND_REGISTER;
2589                   *dst++ = c;
                       /* This path skips the consumed_char increment at
                          the bottom of the loop.  */
2590                   continue;
2591                 }
2592               /* fall down to treat '\r' as '\n' ...  */
2593               c = '\n';
2594             }
2595           if (c == '\n')
2596             {
2597               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2598                 ENCODE_RESET_PLANE_AND_REGISTER;
2599               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2600                 bcopy (coding->spec.iso2022.initial_designation,
2601                        coding->spec.iso2022.current_designation,
2602                        sizeof coding->spec.iso2022.initial_designation);
2603               if (coding->eol_type == CODING_EOL_LF
2604                   || coding->eol_type == CODING_EOL_UNDECIDED)
2605                 *dst++ = ISO_CODE_LF;
2606               else if (coding->eol_type == CODING_EOL_CRLF)
2607                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2608               else
2609                 *dst++ = ISO_CODE_CR;
2610               CODING_SPEC_ISO_BOL (coding) = 1;
2611             }
2612           else
2613             {
2614               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2615                 ENCODE_RESET_PLANE_AND_REGISTER;
2616               *dst++ = c;
2617             }
2618         }
2619       else if (ASCII_BYTE_P (c))
2620         ENCODE_ISO_CHARACTER (c);
2621       else if (SINGLE_BYTE_CHAR_P (c))
2622         {
               /* Not encodable in ISO2022: pass the byte through and
                  record an error.  */
2623           *dst++ = c;
2624           coding->errors++;
2625         }
2626       else if (coding->flags & CODING_FLAG_ISO_SAFE
2627                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2628         ENCODE_UNSAFE_CHARACTER (c);
2629       else
2630         ENCODE_ISO_CHARACTER (c);
2631
2632       coding->consumed_char++;
2633     }
2634
2635  label_end_of_loop:
       /* SRC_BASE, not SRC: an iteration that was abandoned half-way
          does not count as consumed.  */
2636   coding->consumed = src_base - source;
2637   coding->produced = coding->produced_char = dst - destination;
2638 }
2639
2640 \f
2641 /*** 4. SJIS and BIG5 handlers ***/
2642
2643 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2644    quite widely.  So, for the moment, Emacs supports them in the bare
2645    C code.  But, in the future, they may be supported only by CCL.  */
2646
2647 /* SJIS is a coding system encoding three character sets: ASCII, right
2648    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2649    as is.  A character of charset katakana-jisx0201 is encoded by
2650    "position-code + 0x80".  A character of charset japanese-jisx0208
2651    is encoded in 2-byte but two position-codes are divided and shifted
2652    so that it fits in the range below.
2653
2654    --- CODE RANGE of SJIS ---
2655    (character set)      (range)
2656    ASCII                0x00 .. 0x7F
2657    KATAKANA-JISX0201    0xA1 .. 0xDF
2658    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2659             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2660    -------------------------------
2661
2662 */
2663
2664 /* BIG5 is a coding system encoding two character sets: ASCII and
2665    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2666    character set and is encoded in two bytes.
2667
2668    --- CODE RANGE of BIG5 ---
2669    (character set)      (range)
2670    ASCII                0x00 .. 0x7F
2671    Big5 (1st byte)      0xA1 .. 0xFE
2672         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2673    --------------------------
2674
2675    Since the number of characters in Big5 is larger than maximum
2676    characters in Emacs' charset (96x96), it can't be handled as one
2677    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2678    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2679    contains frequently used characters and the latter contains less
2680    frequently used characters.  */
2681
2682 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2683    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2684    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2685    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2686
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes 0x7F - 0x40 values in 0x40..0x7E and
   0xFF - 0xA1 values in 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the Big5 byte pair B1, B2 into an Emacs charset and its two
   position-codes.  The pair is first turned into a linear index
   (BIG5_SAME_ROW codes per 1st byte, the two ranges of the 2nd byte
   made contiguous).  Codes whose 1st byte is below 0xC9 belong to
   charset_big5_1; the others belong to charset_big5_2 and are
   re-based to start from zero.  The index is then split into two
   position-codes of 0xFF - 0xA1 values each, starting at 0x21.

   B1 and B2 may be arbitrary expressions without side effects (each
   is evaluated twice); they are parenthesized in the expansion so
   that arguments such as `x & 0xFF' group as intended.  CHARSET, C1
   and C2 must be lvalues.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      charset = charset_big5_1;                                         \
    else                                                                \
      {                                                                 \
        charset = charset_big5_2;                                       \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
    c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
  } while (0)

/* The reverse of DECODE_BIG5: convert CHARSET and the position-codes
   C1, C2 into the Big5 byte pair B1, B2.  CHARSET, C1 and C2 may be
   arbitrary expressions (parenthesized in the expansion, evaluated
   once); B1 and B2 must be lvalues.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
    b2 = temp % BIG5_SAME_ROW;                                          \
    /* Indices below 0x3F map to 0x40..0x7E, the rest to 0xA1..0xFE.  */ \
    b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
  } while (0)
2714
2715 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2716    Check if a text is encoded in SJIS.  If it is, return
2717    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2718
2719 static int
2720 detect_coding_sjis (src, src_end, multibytep)
2721      unsigned char *src, *src_end;
2722      int multibytep;
2723 {
2724   int c;
2725   /* Dummy for ONE_MORE_BYTE.  */
2726   struct coding_system dummy_coding;
2727   struct coding_system *coding = &dummy_coding;
2728
2729   while (1)
2730     {
2731       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2732       if (c < 0x80)
2733         continue;
2734       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2735         return 0;
2736       if (c <= 0x9F || c >= 0xE0)
2737         {
2738           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2739           if (c < 0x40 || c == 0x7F || c > 0xFC)
2740             return 0;
2741         }
2742     }
2743  label_end_of_loop:
2744   return CODING_CATEGORY_MASK_SJIS;
2745 }
2746
2747 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2748    Check if a text is encoded in BIG5.  If it is, return
2749    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2750
2751 static int
2752 detect_coding_big5 (src, src_end, multibytep)
2753      unsigned char *src, *src_end;
2754      int multibytep;
2755 {
2756   int c;
2757   /* Dummy for ONE_MORE_BYTE.  */
2758   struct coding_system dummy_coding;
2759   struct coding_system *coding = &dummy_coding;
2760
2761   while (1)
2762     {
2763       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2764       if (c < 0x80)
2765         continue;
2766       if (c < 0xA1 || c > 0xFE)
2767         return 0;
2768       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2769       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2770         return 0;
2771     }
2772  label_end_of_loop:
2773   return CODING_CATEGORY_MASK_BIG5;
2774 }
2775
2776 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2777    Check if a text is encoded in UTF-8.  If it is, return
2778    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2779
/* Nonzero if the bits of C selected by MASK are exactly BITS.  */
#define UTF_8_OCTET_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

/* C is a complete one-octet sequence (0xxxxxxx).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C is a trailing octet of a multi-octet sequence (10xxxxxx).  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
/* C is the leading octet of a 2-, 3-, 4-, 5- or 6-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFE, 0xFC)
2787
2788 static int
2789 detect_coding_utf_8 (src, src_end, multibytep)
2790      unsigned char *src, *src_end;
2791      int multibytep;
2792 {
2793   unsigned char c;
2794   int seq_maybe_bytes;
2795   /* Dummy for ONE_MORE_BYTE.  */
2796   struct coding_system dummy_coding;
2797   struct coding_system *coding = &dummy_coding;
2798
2799   while (1)
2800     {
2801       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2802       if (UTF_8_1_OCTET_P (c))
2803         continue;
2804       else if (UTF_8_2_OCTET_LEADING_P (c))
2805         seq_maybe_bytes = 1;
2806       else if (UTF_8_3_OCTET_LEADING_P (c))
2807         seq_maybe_bytes = 2;
2808       else if (UTF_8_4_OCTET_LEADING_P (c))
2809         seq_maybe_bytes = 3;
2810       else if (UTF_8_5_OCTET_LEADING_P (c))
2811         seq_maybe_bytes = 4;
2812       else if (UTF_8_6_OCTET_LEADING_P (c))
2813         seq_maybe_bytes = 5;
2814       else
2815         return 0;
2816
2817       do
2818         {
2819           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2820           if (!UTF_8_EXTRA_OCTET_P (c))
2821             return 0;
2822           seq_maybe_bytes--;
2823         }
2824       while (seq_maybe_bytes > 0);
2825     }
2826
2827  label_end_of_loop:
2828   return CODING_CATEGORY_MASK_UTF_8;
2829 }
2830
2831 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2832    Check if a text is encoded in UTF-16 by looking at its first two
2833    bytes: FE FF means Big Endian, FF FE means Little Endian.  If one
2834    matches, return CODING_CATEGORY_MASK_UTF_16_BE or
2835    CODING_CATEGORY_MASK_UTF_16_LE respectively, else return 0.  */
2836
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits identify the surrogate kind, so the mask must be
   0xFC00; a narrower mask would also accept low surrogates.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2846
2847 static int
2848 detect_coding_utf_16 (src, src_end, multibytep)
2849      unsigned char *src, *src_end;
2850      int multibytep;
2851 {
2852   unsigned char c1, c2;
2853   /* Dummy for TWO_MORE_BYTES.  */
2854   struct coding_system dummy_coding;
2855   struct coding_system *coding = &dummy_coding;
2856
2857   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2858   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2859
2860   if ((c1 == 0xFF) && (c2 == 0xFE))
2861     return CODING_CATEGORY_MASK_UTF_16_LE;
2862   else if ((c1 == 0xFE) && (c2 == 0xFF))
2863     return CODING_CATEGORY_MASK_UTF_16_BE;
2864
2865  label_end_of_loop:
2866   return 0;
2867 }
2868
2869 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2870    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2871
2872 static void
2873 decode_coding_sjis_big5 (coding, source, destination,
2874                          src_bytes, dst_bytes, sjis_p)
2875      struct coding_system *coding;
2876      unsigned char *source, *destination;
2877      int src_bytes, dst_bytes;
2878      int sjis_p;
2879 {
2880   unsigned char *src = source;
2881   unsigned char *src_end = source + src_bytes;
2882   unsigned char *dst = destination;
2883   unsigned char *dst_end = destination + dst_bytes;
2884   /* SRC_BASE remembers the start position in source in each loop.
2885      The loop will be exited when there's not enough source code
2886      (within macro ONE_MORE_BYTE), or when there's not enough
2887      destination area to produce a character (within macro
2888      EMIT_CHAR).  */
2889   unsigned char *src_base;
2890   Lisp_Object translation_table;
2891
2892   if (NILP (Venable_character_translation))
2893     translation_table = Qnil;
2894   else
2895     {
2896       translation_table = coding->translation_table_for_decode;
2897       if (NILP (translation_table))
2898         translation_table = Vstandard_translation_table_for_decode;
2899     }
2900
2901   coding->produced_char = 0;
2902   while (1)
2903     {
2904       int c, charset, c1, c2;
2905
2906       src_base = src;
2907       ONE_MORE_BYTE (c1);
2908
2909       if (c1 < 0x80)
2910         {
2911           charset = CHARSET_ASCII;
2912           if (c1 < 0x20)
2913             {
2914               if (c1 == '\r')
2915                 {
2916                   if (coding->eol_type == CODING_EOL_CRLF)
2917                     {
2918                       ONE_MORE_BYTE (c2);
2919                       if (c2 == '\n')
2920                         c1 = c2;
2921                       else if (coding->mode
2922                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2923                         {
2924                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2925                           goto label_end_of_loop;
2926                         }
2927                       else
2928                         /* To process C2 again, SRC is subtracted by 1.  */
2929                         src--;
2930                     }
2931                   else if (coding->eol_type == CODING_EOL_CR)
2932                     c1 = '\n';
2933                 }
2934               else if (c1 == '\n'
2935                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2936                        && (coding->eol_type == CODING_EOL_CR
2937                            || coding->eol_type == CODING_EOL_CRLF))
2938                 {
2939                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2940                   goto label_end_of_loop;
2941                 }
2942             }
2943         }
2944       else
2945         {
2946           if (sjis_p)
2947             {
2948               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
2949                 goto label_invalid_code;
2950               if (c1 <= 0x9F || c1 >= 0xE0)
2951                 {
2952                   /* SJIS -> JISX0208 */
2953                   ONE_MORE_BYTE (c2);
2954                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2955                     goto label_invalid_code;
2956                   DECODE_SJIS (c1, c2, c1, c2);
2957                   charset = charset_jisx0208;
2958                 }
2959               else
2960                 /* SJIS -> JISX0201-Kana */
2961                 charset = charset_katakana_jisx0201;
2962             }
2963           else
2964             {
2965               /* BIG5 -> Big5 */
2966               if (c1 < 0xA0 || c1 > 0xFE)
2967                 goto label_invalid_code;
2968               ONE_MORE_BYTE (c2);
2969               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2970                 goto label_invalid_code;
2971               DECODE_BIG5 (c1, c2, charset, c1, c2);
2972             }
2973         }
2974
2975       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2976       EMIT_CHAR (c);
2977       continue;
2978
2979     label_invalid_code:
2980       coding->errors++;
2981       src = src_base;
2982       c = *src++;
2983       EMIT_CHAR (c);
2984     }
2985
2986  label_end_of_loop:
2987   coding->consumed = coding->consumed_char = src_base - source;
2988   coding->produced = dst - destination;
2989   return;
2990 }
2991
2992 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2993    This function can encode charsets `ascii', `katakana-jisx0201',
2994    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2995    are sure that all these charsets are registered as official charset
2996    (i.e. do not have extended leading-codes).  Characters of other
2997    charsets are produced without any encoding.  If SJIS_P is 1, encode
2998    SJIS text, else encode BIG5 text.  */
2999
3000 static void
3001 encode_coding_sjis_big5 (coding, source, destination,
3002                          src_bytes, dst_bytes, sjis_p)
3003      struct coding_system *coding;
3004      unsigned char *source, *destination;
3005      int src_bytes, dst_bytes;
3006      int sjis_p;
3007 {
3008   unsigned char *src = source;
3009   unsigned char *src_end = source + src_bytes;
3010   unsigned char *dst = destination;
3011   unsigned char *dst_end = destination + dst_bytes;
3012   /* SRC_BASE remembers the start position in source in each loop.
3013      The loop will be exited when there's not enough source text to
3014      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3015      there's not enough destination area to produce encoded codes
3016      (within macro EMIT_BYTES).  */
3017   unsigned char *src_base;
3018   Lisp_Object translation_table;
3019
3020   if (NILP (Venable_character_translation))
3021     translation_table = Qnil;
3022   else
3023     {
3024       translation_table = coding->translation_table_for_encode;
3025       if (NILP (translation_table))
3026         translation_table = Vstandard_translation_table_for_encode;
3027     }
3028
3029   while (1)
3030     {
3031       int c, charset, c1, c2;
3032
3033       src_base = src;
3034       ONE_MORE_CHAR (c);
3035
3036       /* Now encode the character C.  */
3037       if (SINGLE_BYTE_CHAR_P (c))
3038         {
3039           switch (c)
3040             {
3041             case '\r':
3042               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3043                 {
3044                   EMIT_ONE_BYTE (c);
3045                   break;
3046                 }
3047               c = '\n';
3048             case '\n':
3049               if (coding->eol_type == CODING_EOL_CRLF)
3050                 {
3051                   EMIT_TWO_BYTES ('\r', c);
3052                   break;
3053                 }
3054               else if (coding->eol_type == CODING_EOL_CR)
3055                 c = '\r';
3056             default:
3057               EMIT_ONE_BYTE (c);
3058             }
3059         }
3060       else
3061         {
3062           SPLIT_CHAR (c, charset, c1, c2);
3063           if (sjis_p)
3064             {
3065               if (charset == charset_jisx0208
3066                   || charset == charset_jisx0208_1978)
3067                 {
3068                   ENCODE_SJIS (c1, c2, c1, c2);
3069                   EMIT_TWO_BYTES (c1, c2);
3070                 }
3071               else if (charset == charset_katakana_jisx0201)
3072                 EMIT_ONE_BYTE (c1 | 0x80);
3073               else if (charset == charset_latin_jisx0201)
3074                 EMIT_ONE_BYTE (c1);
3075               else
3076                 /* There's no way other than producing the internal
3077                    codes as is.  */
3078                 EMIT_BYTES (src_base, src);
3079             }
3080           else
3081             {
3082               if (charset == charset_big5_1 || charset == charset_big5_2)
3083                 {
3084                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3085                   EMIT_TWO_BYTES (c1, c2);
3086                 }
3087               else
3088                 /* There's no way other than producing the internal
3089                    codes as is.  */
3090                 EMIT_BYTES (src_base, src);
3091             }
3092         }
3093       coding->consumed_char++;
3094     }
3095
3096  label_end_of_loop:
3097   coding->consumed = src_base - source;
3098   coding->produced = coding->produced_char = dst - destination;
3099 }
3100
3101 \f
3102 /*** 5. CCL handlers ***/
3103
3104 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3105    Check if a text is encoded in a coding system of which
3106    encoder/decoder are written in CCL program.  If it is, return
3107    CODING_CATEGORY_MASK_CCL, else return 0.  */
3108
3109 static int
3110 detect_coding_ccl (src, src_end, multibytep)
3111      unsigned char *src, *src_end;
3112      int multibytep;
3113 {
3114   unsigned char *valid;
3115   int c;
3116   /* Dummy for ONE_MORE_BYTE.  */
3117   struct coding_system dummy_coding;
3118   struct coding_system *coding = &dummy_coding;
3119
3120   /* No coding system is assigned to coding-category-ccl.  */
3121   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3122     return 0;
3123
3124   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3125   while (1)
3126     {
3127       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3128       if (! valid[c])
3129         return 0;
3130     }
3131  label_end_of_loop:
3132   return CODING_CATEGORY_MASK_CCL;
3133 }
3134
3135 \f
3136 /*** 6. End-of-line handlers ***/
3137
3138 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3139
3140 static void
3141 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3142      struct coding_system *coding;
3143      unsigned char *source, *destination;
3144      int src_bytes, dst_bytes;
3145 {
3146   unsigned char *src = source;
3147   unsigned char *dst = destination;
3148   unsigned char *src_end = src + src_bytes;
3149   unsigned char *dst_end = dst + dst_bytes;
3150   Lisp_Object translation_table;
3151   /* SRC_BASE remembers the start position in source in each loop.
3152      The loop will be exited when there's not enough source code
3153      (within macro ONE_MORE_BYTE), or when there's not enough
3154      destination area to produce a character (within macro
3155      EMIT_CHAR).  */
3156   unsigned char *src_base;
3157   int c;
3158
3159   translation_table = Qnil;
3160   switch (coding->eol_type)
3161     {
3162     case CODING_EOL_CRLF:
3163       while (1)
3164         {
3165           src_base = src;
3166           ONE_MORE_BYTE (c);
3167           if (c == '\r')
3168             {
3169               ONE_MORE_BYTE (c);
3170               if (c != '\n')
3171                 {
3172                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3173                     {
3174                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
3175                       goto label_end_of_loop;
3176                     }
3177                   src--;
3178                   c = '\r';
3179                 }
3180             }
3181           else if (c == '\n'
3182                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3183             {
3184               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3185               goto label_end_of_loop;
3186             }
3187           EMIT_CHAR (c);
3188         }
3189       break;
3190
3191     case CODING_EOL_CR:
3192       while (1)
3193         {
3194           src_base = src;
3195           ONE_MORE_BYTE (c);
3196           if (c == '\n')
3197             {
3198               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3199                 {
3200                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3201                   goto label_end_of_loop;
3202                 }
3203             }
3204           else if (c == '\r')
3205             c = '\n';
3206           EMIT_CHAR (c);
3207         }
3208       break;
3209
3210     default:                    /* no need for EOL handling */
3211       while (1)
3212         {
3213           src_base = src;
3214           ONE_MORE_BYTE (c);
3215           EMIT_CHAR (c);
3216         }
3217     }
3218
3219  label_end_of_loop:
3220   coding->consumed = coding->consumed_char = src_base - source;
3221   coding->produced = dst - destination;
3222   return;
3223 }
3224
3225 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3226    format of end-of-line according to `coding->eol_type'.  It also
3227    convert multibyte form 8-bit characters to unibyte if
3228    CODING->src_multibyte is nonzero.  If `coding->mode &
3229    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3230    also means end-of-line.  */
3231
3232 static void
3233 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3234      struct coding_system *coding;
3235      unsigned char *source, *destination;
3236      int src_bytes, dst_bytes;
3237 {
3238   unsigned char *src = source;
3239   unsigned char *dst = destination;
3240   unsigned char *src_end = src + src_bytes;
3241   unsigned char *dst_end = dst + dst_bytes;
3242   Lisp_Object translation_table;
3243   /* SRC_BASE remembers the start position in source in each loop.
3244      The loop will be exited when there's not enough source text to
3245      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3246      there's not enough destination area to produce encoded codes
3247      (within macro EMIT_BYTES).  */
3248   unsigned char *src_base;
3249   int c;
3250   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3251
3252   translation_table = Qnil;
3253   if (coding->src_multibyte
3254       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3255     {
3256       src_end--;
3257       src_bytes--;
3258       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3259     }
3260
3261   if (coding->eol_type == CODING_EOL_CRLF)
3262     {
3263       while (src < src_end)
3264         {
3265           src_base = src;
3266           c = *src++;
3267           if (c >= 0x20)
3268             EMIT_ONE_BYTE (c);
3269           else if (c == '\n' || (c == '\r' && selective_display))
3270             EMIT_TWO_BYTES ('\r', '\n');
3271           else
3272             EMIT_ONE_BYTE (c);
3273         }
3274       src_base = src;
3275     label_end_of_loop:
3276       ;
3277     }
3278   else
3279     {
3280       if (!dst_bytes || src_bytes <= dst_bytes)
3281         {
3282           safe_bcopy (src, dst, src_bytes);
3283           src_base = src_end;
3284           dst += src_bytes;
3285         }
3286       else
3287         {
3288           if (coding->src_multibyte
3289               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3290             dst_bytes--;
3291           safe_bcopy (src, dst, dst_bytes);
3292           src_base = src + dst_bytes;
3293           dst = destination + dst_bytes;
3294           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3295         }
3296       if (coding->eol_type == CODING_EOL_CR)
3297         {
3298           for (src = destination; src < dst; src++)
3299             if (*src == '\n') *src = '\r';
3300         }
3301       else if (selective_display)
3302         {
3303           for (src = destination; src < dst; src++)
3304             if (*src == '\r') *src = '\n';
3305         }
3306     }
3307   if (coding->src_multibyte)
3308     dst = destination + str_as_unibyte (destination, dst - destination);
3309
3310   coding->consumed = src_base - source;
3311   coding->produced = dst - destination;
3312   coding->produced_char = coding->produced;
3313 }
3314
3315 \f
3316 /*** 7. C library functions ***/
3317
3318 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3319    has a property `coding-system'.  The value of this property is a
3320    vector of length 5 (called the coding-vector).  Among elements of
3321    this vector, the first (element[0]) and the fifth (element[4])
3322    carry important information for decoding/encoding.  Before
3323    decoding/encoding, this information should be set in fields of a
3324    structure of type `coding_system'.
3325
3326    The value of the property `coding-system' can be a symbol of another
3327    subsidiary coding-system.  In that case, Emacs gets coding-vector
3328    from that symbol.
3329
3330    `element[0]' contains information to be set in `coding->type'.  The
3331    value and its meaning is as follows:
3332
3333    0 -- coding_type_emacs_mule
3334    1 -- coding_type_sjis
3335    2 -- coding_type_iso2022
3336    3 -- coding_type_big5
3337    4 -- coding_type_ccl encoder/decoder written in CCL
3338    nil -- coding_type_no_conversion
3339    t -- coding_type_undecided (automatic conversion on decoding,
3340                                no-conversion on encoding)
3341
3342    `element[4]' contains information to be set in `coding->flags' and
3343    `coding->spec'.  The meaning varies by `coding->type'.
3344
3345    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3346    of length 32 (of which the first 13 sub-elements are used now).
3347    Meanings of these sub-elements are:
3348
3349    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3350         If the value is an integer of valid charset, the charset is
3351         assumed to be designated to graphic register N initially.
3352
3353         If the value is minus, it is a minus value of charset which
3354         reserves graphic register N, which means that the charset is
3355         not designated initially but should be designated to graphic
3356         register N just before encoding a character in that charset.
3357
3358         If the value is nil, graphic register N is never used on
3359         encoding.
3360
3361    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3362         Each value takes t or nil.  See the section ISO2022 of
3363         `coding.h' for more information.
3364
3365    If `coding->type' is `coding_type_big5', element[4] is t to denote
3366    BIG5-ETen or nil to denote BIG5-HKU.
3367
3368    If `coding->type' takes the other value, element[4] is ignored.
3369
3370    Emacs Lisp's coding systems also carry information about format of
3371    end-of-line in a value of property `eol-type'.  If the value is
3372    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3373    means CODING_EOL_CR.  If it is not integer, it should be a vector
3374    of subsidiary coding systems of which property `eol-type' has one
3375    of the above values.
3376
3377 */
3378
3379 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3380    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3381    is setup so that no conversion is necessary and return -1, else
3382    return 0.  */
3383
3384 int
3385 setup_coding_system (coding_system, coding)
3386      Lisp_Object coding_system;
3387      struct coding_system *coding;
3388 {
3389   Lisp_Object coding_spec, coding_type, eol_type, plist;
3390   Lisp_Object val;
3391
3392   /* At first, zero clear all members.  */
3393   bzero (coding, sizeof (struct coding_system));
3394
3395   /* Initialize some fields required for all kinds of coding systems.  */
3396   coding->symbol = coding_system;
3397   coding->heading_ascii = -1;
3398   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3399   coding->composing = COMPOSITION_DISABLED;
3400   coding->cmp_data = NULL;
3401
3402   if (NILP (coding_system))
3403     goto label_invalid_coding_system;
3404
3405   coding_spec = Fget (coding_system, Qcoding_system);
3406
3407   if (!VECTORP (coding_spec)
3408       || XVECTOR (coding_spec)->size != 5
3409       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3410     goto label_invalid_coding_system;
3411
3412   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3413   if (VECTORP (eol_type))
3414     {
3415       coding->eol_type = CODING_EOL_UNDECIDED;
3416       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3417     }
3418   else if (XFASTINT (eol_type) == 1)
3419     {
3420       coding->eol_type = CODING_EOL_CRLF;
3421       coding->common_flags
3422         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3423     }
3424   else if (XFASTINT (eol_type) == 2)
3425     {
3426       coding->eol_type = CODING_EOL_CR;
3427       coding->common_flags
3428         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3429     }
3430   else
3431     coding->eol_type = CODING_EOL_LF;
3432
3433   coding_type = XVECTOR (coding_spec)->contents[0];
3434   /* Try short cut.  */
3435   if (SYMBOLP (coding_type))
3436     {
3437       if (EQ (coding_type, Qt))
3438         {
3439           coding->type = coding_type_undecided;
3440           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3441         }
3442       else
3443         coding->type = coding_type_no_conversion;
3444       /* Initialize this member.  Any thing other than
3445          CODING_CATEGORY_IDX_UTF_16_BE and
3446          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3447          special treatment in detect_eol.  */
3448       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3449
3450       return 0;
3451     }
3452
3453   /* Get values of coding system properties:
3454      `post-read-conversion', `pre-write-conversion',
3455      `translation-table-for-decode', `translation-table-for-encode'.  */
3456   plist = XVECTOR (coding_spec)->contents[3];
3457   /* Pre & post conversion functions should be disabled if
3458      inhibit_eol_conversion is nonzero.  This is the case that a code
3459      conversion function is called while those functions are running.  */
3460   if (! inhibit_pre_post_conversion)
3461     {
3462       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3463       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3464     }
3465   val = Fplist_get (plist, Qtranslation_table_for_decode);
3466   if (SYMBOLP (val))
3467     val = Fget (val, Qtranslation_table_for_decode);
3468   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3469   val = Fplist_get (plist, Qtranslation_table_for_encode);
3470   if (SYMBOLP (val))
3471     val = Fget (val, Qtranslation_table_for_encode);
3472   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3473   val = Fplist_get (plist, Qcoding_category);
3474   if (!NILP (val))
3475     {
3476       val = Fget (val, Qcoding_category_index);
3477       if (INTEGERP (val))
3478         coding->category_idx = XINT (val);
3479       else
3480         goto label_invalid_coding_system;
3481     }
3482   else
3483     goto label_invalid_coding_system;
3484
3485   /* If the coding system has non-nil `composition' property, enable
3486      composition handling.  */
3487   val = Fplist_get (plist, Qcomposition);
3488   if (!NILP (val))
3489     coding->composing = COMPOSITION_NO;
3490
3491   switch (XFASTINT (coding_type))
3492     {
3493     case 0:
3494       coding->type = coding_type_emacs_mule;
3495       coding->common_flags
3496         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3497       coding->composing = COMPOSITION_NO;
3498       if (!NILP (coding->post_read_conversion))
3499         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3500       if (!NILP (coding->pre_write_conversion))
3501         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3502       break;
3503
3504     case 1:
3505       coding->type = coding_type_sjis;
3506       coding->common_flags
3507         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3508       break;
3509
3510     case 2:
3511       coding->type = coding_type_iso2022;
3512       coding->common_flags
3513         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3514       {
3515         Lisp_Object val, temp;
3516         Lisp_Object *flags;
3517         int i, charset, reg_bits = 0;
3518
3519         val = XVECTOR (coding_spec)->contents[4];
3520
3521         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3522           goto label_invalid_coding_system;
3523
3524         flags = XVECTOR (val)->contents;
3525         coding->flags
3526           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3527              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3528              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3529              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3530              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3531              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3532              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3533              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3534              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3535              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3536              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3537              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3538              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3539              );
3540
3541         /* Invoke graphic register 0 to plane 0.  */
3542         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3543         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3544         CODING_SPEC_ISO_INVOCATION (coding, 1)
3545           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3546         /* Not single shifting at first.  */
3547         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3548         /* Beginning of buffer should also be regarded as bol. */
3549         CODING_SPEC_ISO_BOL (coding) = 1;
3550
3551         for (charset = 0; charset <= MAX_CHARSET; charset++)
3552           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3553         val = Vcharset_revision_alist;
3554         while (CONSP (val))
3555           {
3556             charset = get_charset_id (Fcar_safe (XCAR (val)));
3557             if (charset >= 0
3558                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3559                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3560               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3561             val = XCDR (val);
3562           }
3563
3564         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3565            FLAGS[REG] can be one of below:
3566                 integer CHARSET: CHARSET occupies register I,
3567                 t: designate nothing to REG initially, but can be used
3568                   by any charsets,
3569                 list of integer, nil, or t: designate the first
3570                   element (if integer) to REG initially, the remaining
3571                   elements (if integer) is designated to REG on request,
3572                   if an element is t, REG can be used by any charsets,
3573                 nil: REG is never used.  */
3574         for (charset = 0; charset <= MAX_CHARSET; charset++)
3575           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3576             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3577         for (i = 0; i < 4; i++)
3578           {
3579             if ((INTEGERP (flags[i])
3580                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3581                 || (charset = get_charset_id (flags[i])) >= 0)
3582               {
3583                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3584                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3585               }
3586             else if (EQ (flags[i], Qt))
3587               {
3588                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3589                 reg_bits |= 1 << i;
3590                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3591               }
3592             else if (CONSP (flags[i]))
3593               {
3594                 Lisp_Object tail;
3595                 tail = flags[i];
3596
3597                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3598                 if ((INTEGERP (XCAR (tail))
3599                      && (charset = XINT (XCAR (tail)),
3600                          CHARSET_VALID_P (charset)))
3601                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3602                   {
3603                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3604                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3605                   }
3606                 else
3607                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3608                 tail = XCDR (tail);
3609                 while (CONSP (tail))
3610                   {
3611                     if ((INTEGERP (XCAR (tail))
3612                          && (charset = XINT (XCAR (tail)),
3613                              CHARSET_VALID_P (charset)))
3614                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3615                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3616                         = i;
3617                     else if (EQ (XCAR (tail), Qt))
3618                       reg_bits |= 1 << i;
3619                     tail = XCDR (tail);
3620                   }
3621               }
3622             else
3623               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3624
3625             CODING_SPEC_ISO_DESIGNATION (coding, i)
3626               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3627           }
3628
3629         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3630           {
3631             /* REG 1 can be used only by locking shift in 7-bit env.  */
3632             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3633               reg_bits &= ~2;
3634             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3635               /* Without any shifting, only REG 0 and 1 can be used.  */
3636               reg_bits &= 3;
3637           }
3638
3639         if (reg_bits)
3640           for (charset = 0; charset <= MAX_CHARSET; charset++)
3641             {
3642               if (CHARSET_DEFINED_P (charset)
3643                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3644                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3645                 {
3646                   /* There exist some default graphic registers to be
3647                      used by CHARSET.  */
3648
3649                   /* We had better avoid designating a charset of
3650                      CHARS96 to REG 0 as far as possible.  */
3651                   if (CHARSET_CHARS (charset) == 96)
3652                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3653                       = (reg_bits & 2
3654                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3655                   else
3656                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3657                       = (reg_bits & 1
3658                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3659                 }
3660             }
3661       }
3662       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3663       coding->spec.iso2022.last_invalid_designation_register = -1;
3664       break;
3665
3666     case 3:
3667       coding->type = coding_type_big5;
3668       coding->common_flags
3669         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3670       coding->flags
3671         = (NILP (XVECTOR (coding_spec)->contents[4])
3672            ? CODING_FLAG_BIG5_HKU
3673            : CODING_FLAG_BIG5_ETEN);
3674       break;
3675
3676     case 4:
3677       coding->type = coding_type_ccl;
3678       coding->common_flags
3679         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3680       {
3681         val = XVECTOR (coding_spec)->contents[4];
3682         if (! CONSP (val)
3683             || setup_ccl_program (&(coding->spec.ccl.decoder),
3684                                   XCAR (val)) < 0
3685             || setup_ccl_program (&(coding->spec.ccl.encoder),
3686                                   XCDR (val)) < 0)
3687           goto label_invalid_coding_system;
3688
3689         bzero (coding->spec.ccl.valid_codes, 256);
3690         val = Fplist_get (plist, Qvalid_codes);
3691         if (CONSP (val))
3692           {
3693             Lisp_Object this;
3694
3695             for (; CONSP (val); val = XCDR (val))
3696               {
3697                 this = XCAR (val);
3698                 if (INTEGERP (this)
3699                     && XINT (this) >= 0 && XINT (this) < 256)
3700                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3701                 else if (CONSP (this)
3702                          && INTEGERP (XCAR (this))
3703                          && INTEGERP (XCDR (this)))
3704                   {
3705                     int start = XINT (XCAR (this));
3706                     int end = XINT (XCDR (this));
3707
3708                     if (start >= 0 && start <= end && end < 256)
3709                       while (start <= end)
3710                         coding->spec.ccl.valid_codes[start++] = 1;
3711                   }
3712               }
3713           }
3714       }
3715       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3716       coding->spec.ccl.cr_carryover = 0;
3717       coding->spec.ccl.eight_bit_carryover[0] = 0;
3718       break;
3719
3720     case 5:
3721       coding->type = coding_type_raw_text;
3722       break;
3723
3724     default:
3725       goto label_invalid_coding_system;
3726     }
3727   return 0;
3728
3729  label_invalid_coding_system:
3730   coding->type = coding_type_no_conversion;
3731   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3732   coding->common_flags = 0;
3733   coding->eol_type = CODING_EOL_LF;
3734   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3735   return -1;
3736 }
3737
3738 /* Free memory blocks allocated for storing composition information.  */
3739
3740 void
3741 coding_free_composition_data (coding)
3742      struct coding_system *coding;
3743 {
3744   struct composition_data *cmp_data = coding->cmp_data, *next;
3745
3746   if (!cmp_data)
3747     return;
3748   /* Memory blocks are chained.  At first, rewind to the first, then,
3749      free blocks one by one.  */
3750   while (cmp_data->prev)
3751     cmp_data = cmp_data->prev;
3752   while (cmp_data)
3753     {
3754       next = cmp_data->next;
3755       xfree (cmp_data);
3756       cmp_data = next;
3757     }
3758   coding->cmp_data = NULL;
3759 }
3760
3761 /* Set `char_offset' member of all memory blocks pointed by
3762    coding->cmp_data to POS.  */
3763
3764 void
3765 coding_adjust_composition_offset (coding, pos)
3766      struct coding_system *coding;
3767      int pos;
3768 {
3769   struct composition_data *cmp_data;
3770
3771   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3772     cmp_data->char_offset = pos;
3773 }
3774
3775 /* Setup raw-text or one of its subsidiaries in the structure
3776    coding_system CODING according to the already setup value eol_type
3777    in CODING.  CODING should be setup for some coding system in
3778    advance.  */
3779
3780 void
3781 setup_raw_text_coding_system (coding)
3782      struct coding_system *coding;
3783 {
3784   if (coding->type != coding_type_raw_text)
3785     {
3786       coding->symbol = Qraw_text;
3787       coding->type = coding_type_raw_text;
3788       if (coding->eol_type != CODING_EOL_UNDECIDED)
3789         {
3790           Lisp_Object subsidiaries;
3791           subsidiaries = Fget (Qraw_text, Qeol_type);
3792
3793           if (VECTORP (subsidiaries)
3794               && XVECTOR (subsidiaries)->size == 3)
3795             coding->symbol
3796               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3797         }
3798       setup_coding_system (coding->symbol, coding);
3799     }
3800   return;
3801 }
3802
3803 /* Emacs has a mechanism to automatically detect a coding system if it
3804    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3805    it's impossible to distinguish some coding systems accurately
3806    because they use the same range of codes.  So, at first, coding
3807    systems are categorized into the following categories:
3808
3809    o coding-category-emacs-mule
3810
3811         The category for a coding system which has the same code range
3812         as Emacs' internal format.  Assigned the coding-system (Lisp
3813         symbol) `emacs-mule' by default.
3814
3815    o coding-category-sjis
3816
3817         The category for a coding system which has the same code range
3818         as SJIS.  Assigned the coding-system (Lisp
3819         symbol) `japanese-shift-jis' by default.
3820
3821    o coding-category-iso-7
3822
3823         The category for a coding system which has the same code range
3824         as ISO2022 of 7-bit environment.  This doesn't use any locking
3825         shift and single shift functions.  This can encode/decode all
3826         charsets.  Assigned the coding-system (Lisp symbol)
3827         `iso-2022-7bit' by default.
3828
3829    o coding-category-iso-7-tight
3830
3831         Same as coding-category-iso-7 except that this can
3832         encode/decode only the specified charsets.
3833
3834    o coding-category-iso-8-1
3835
3836         The category for a coding system which has the same code range
3837         as ISO2022 of 8-bit environment and graphic plane 1 used only
3838         for DIMENSION1 charset.  This doesn't use any locking shift
3839         and single shift functions.  Assigned the coding-system (Lisp
3840         symbol) `iso-latin-1' by default.
3841
3842    o coding-category-iso-8-2
3843
3844         The category for a coding system which has the same code range
3845         as ISO2022 of 8-bit environment and graphic plane 1 used only
3846         for DIMENSION2 charset.  This doesn't use any locking shift
3847         and single shift functions.  Assigned the coding-system (Lisp
3848         symbol) `japanese-iso-8bit' by default.
3849
3850    o coding-category-iso-7-else
3851
3852         The category for a coding system which has the same code range
3853         as ISO2022 of 7-bit environment but uses locking shift or
3854         single shift functions.  Assigned the coding-system (Lisp
3855         symbol) `iso-2022-7bit-lock' by default.
3856
3857    o coding-category-iso-8-else
3858
3859         The category for a coding system which has the same code range
3860         as ISO2022 of 8-bit environment but uses locking shift or
3861         single shift functions.  Assigned the coding-system (Lisp
3862         symbol) `iso-2022-8bit-ss2' by default.
3863
3864    o coding-category-big5
3865
3866         The category for a coding system which has the same code range
3867         as BIG5.  Assigned the coding-system (Lisp symbol)
3868         `cn-big5' by default.
3869
3870    o coding-category-utf-8
3871
3872         The category for a coding system which has the same code range
3873         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3874         symbol) `utf-8' by default.
3875
3876    o coding-category-utf-16-be
3877
3878         The category for a coding system in which a text has an
3879         Unicode signature (cf. Unicode Standard) in the order of BIG
3880         endian at the head.  Assigned the coding-system (Lisp symbol)
3881         `utf-16-be' by default.
3882
3883    o coding-category-utf-16-le
3884
3885         The category for a coding system in which a text has an
3886         Unicode signature (cf. Unicode Standard) in the order of
3887         LITTLE endian at the head.  Assigned the coding-system (Lisp
3888         symbol) `utf-16-le' by default.
3889
3890    o coding-category-ccl
3891
3892         The category for a coding system of which encoder/decoder is
3893         written in CCL programs.  The default value is nil, i.e., no
3894         coding system is assigned.
3895
3896    o coding-category-binary
3897
3898         The category for a coding system not categorized in any of the
3899         above.  Assigned the coding-system (Lisp symbol)
3900         `no-conversion' by default.
3901
3902    Each of them is a Lisp symbol and the value is an actual
3903    `coding-system' (this is also a Lisp symbol) assigned by a user.
3904    What Emacs does actually is to detect a category of coding system.
3905    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3906    decide a single possible category, it selects a category of the
3907    highest priority.  Priorities of categories are also specified by a
3908    user in a Lisp variable `coding-category-list'.
3909
3910 */
3911
/* Table indexed by byte value and consulted by detect_coding_mask: a
   nonzero entry lets the initial scan step over that byte.  The
   entries for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC are cleared at
   the start of every detect_coding_mask call and set again once such a
   code turns out not to start a valid ISO2022 sequence.  */
static int ascii_skip_code[256];
3914
3915 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3916    If it detects possible coding systems, return an integer in which
3917    appropriate flag bits are set.  Flag bits are defined by macros
3918    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3919    it should point the table `coding_priorities'.  In that case, only
3920    the flag bit for a coding system of the highest priority is set in
3921    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
3922    range 0x80..0x9F are in multibyte form.
3923
3924    How many ASCII characters are at the head is returned as *SKIP.  */
3925
3926 static int
3927 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
3928      unsigned char *source;
3929      int src_bytes, *priorities, *skip;
3930      int multibytep;
3931 {
3932   register unsigned char c;
3933   unsigned char *src = source, *src_end = source + src_bytes;
3934   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3935   int i;
3936
3937   /* At first, skip all ASCII characters and control characters except
3938      for three ISO2022 specific control characters.  */
3939   ascii_skip_code[ISO_CODE_SO] = 0;
3940   ascii_skip_code[ISO_CODE_SI] = 0;
3941   ascii_skip_code[ISO_CODE_ESC] = 0;
3942
3943  label_loop_detect_coding:
3944   while (src < src_end && ascii_skip_code[*src]) src++;
3945   *skip = src - source;
3946
3947   if (src >= src_end)
3948     /* We found nothing other than ASCII.  There's nothing to do.  */
3949     return 0;
3950
3951   c = *src;
3952   /* The text seems to be encoded in some multilingual coding system.
3953      Now, try to find in which coding system the text is encoded.  */
3954   if (c < 0x80)
3955     {
3956       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3957       /* C is an ISO2022 specific control code of C0.  */
3958       mask = detect_coding_iso2022 (src, src_end, multibytep);
3959       if (mask == 0)
3960         {
3961           /* No valid ISO2022 code follows C.  Try again.  */
3962           src++;
3963           if (c == ISO_CODE_ESC)
3964             ascii_skip_code[ISO_CODE_ESC] = 1;
3965           else
3966             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3967           goto label_loop_detect_coding;
3968         }
3969       if (priorities)
3970         {
3971           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3972             {
3973               if (mask & priorities[i])
3974                 return priorities[i];
3975             }
3976           return CODING_CATEGORY_MASK_RAW_TEXT;
3977         }
3978     }
3979   else
3980     {
3981       int try;
3982
3983       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
3984         c = src[1] - 0x20;
3985
3986       if (c < 0xA0)
3987         {
3988           /* C is the first byte of SJIS character code,
3989              or a leading-code of Emacs' internal format (emacs-mule),
3990              or the first byte of UTF-16.  */
3991           try = (CODING_CATEGORY_MASK_SJIS
3992                   | CODING_CATEGORY_MASK_EMACS_MULE
3993                   | CODING_CATEGORY_MASK_UTF_16_BE
3994                   | CODING_CATEGORY_MASK_UTF_16_LE);
3995
3996           /* Or, if C is a special latin extra code,
3997              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3998              or is an ISO2022 control-sequence-introducer (CSI),
3999              we should also consider the possibility of ISO2022 codings.  */
4000           if ((VECTORP (Vlatin_extra_code_table)
4001                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4002               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4003               || (c == ISO_CODE_CSI
4004                   && (src < src_end
4005                       && (*src == ']'
4006                           || ((*src == '0' || *src == '1' || *src == '2')
4007                               && src + 1 < src_end
4008                               && src[1] == ']')))))
4009             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4010                      | CODING_CATEGORY_MASK_ISO_8BIT);
4011         }
4012       else
4013         /* C is a character of ISO2022 in graphic plane right,
4014            or a SJIS's 1-byte character code (i.e. JISX0201),
4015            or the first byte of BIG5's 2-byte code,
4016            or the first byte of UTF-8/16.  */
4017         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4018                 | CODING_CATEGORY_MASK_ISO_8BIT
4019                 | CODING_CATEGORY_MASK_SJIS
4020                 | CODING_CATEGORY_MASK_BIG5
4021                 | CODING_CATEGORY_MASK_UTF_8
4022                 | CODING_CATEGORY_MASK_UTF_16_BE
4023                 | CODING_CATEGORY_MASK_UTF_16_LE);
4024
4025       /* Or, we may have to consider the possibility of CCL.  */
4026       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4027           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4028               ->spec.ccl.valid_codes)[c])
4029         try |= CODING_CATEGORY_MASK_CCL;
4030
4031       mask = 0;
4032       utf16_examined_p = iso2022_examined_p = 0;
4033       if (priorities)
4034         {
4035           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4036             {
4037               if (!iso2022_examined_p
4038                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4039                 {
4040                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4041                   iso2022_examined_p = 1;
4042                 }
4043               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4044                 mask |= detect_coding_sjis (src, src_end, multibytep);
4045               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4046                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4047               else if (!utf16_examined_p
4048                        && (priorities[i] & try &
4049                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4050                 {
4051                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4052                   utf16_examined_p = 1;
4053                 }
4054               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4055                 mask |= detect_coding_big5 (src, src_end, multibytep);
4056               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4057                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4058               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4059                 mask |= detect_coding_ccl (src, src_end, multibytep);
4060               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4061                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4062               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4063                 mask |= CODING_CATEGORY_MASK_BINARY;
4064               if (mask & priorities[i])
4065                 return priorities[i];
4066             }
4067           return CODING_CATEGORY_MASK_RAW_TEXT;
4068         }
4069       if (try & CODING_CATEGORY_MASK_ISO)
4070         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4071       if (try & CODING_CATEGORY_MASK_SJIS)
4072         mask |= detect_coding_sjis (src, src_end, multibytep);
4073       if (try & CODING_CATEGORY_MASK_BIG5)
4074         mask |= detect_coding_big5 (src, src_end, multibytep);
4075       if (try & CODING_CATEGORY_MASK_UTF_8)
4076         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4077       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4078         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4079       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4080         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4081       if (try & CODING_CATEGORY_MASK_CCL)
4082         mask |= detect_coding_ccl (src, src_end, multibytep);
4083     }
4084   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4085 }
4086
4087 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4088    The information of the detected coding system is set in CODING.  */
4089
4090 void
4091 detect_coding (coding, src, src_bytes)
4092      struct coding_system *coding;
4093      unsigned char *src;
4094      int src_bytes;
4095 {
4096   unsigned int idx;
4097   int skip, mask;
4098   Lisp_Object val;
4099
4100   val = Vcoding_category_list;
4101   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4102                              coding->src_multibyte);
4103   coding->heading_ascii = skip;
4104
4105   if (!mask) return;
4106
4107   /* We found a single coding system of the highest priority in MASK.  */
4108   idx = 0;
4109   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4110   if (! mask)
4111     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4112
4113   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4114
4115   if (coding->eol_type != CODING_EOL_UNDECIDED)
4116     {
4117       Lisp_Object tmp;
4118
4119       tmp = Fget (val, Qeol_type);
4120       if (VECTORP (tmp))
4121         val = XVECTOR (tmp)->contents[coding->eol_type];
4122     }
4123
4124   /* Setup this new coding system while preserving some slots.  */
4125   {
4126     int src_multibyte = coding->src_multibyte;
4127     int dst_multibyte = coding->dst_multibyte;
4128
4129     setup_coding_system (val, coding);
4130     coding->src_multibyte = src_multibyte;
4131     coding->dst_multibyte = dst_multibyte;
4132     coding->heading_ascii = skip;
4133   }
4134 }
4135
4136 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4137    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4138    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4139
4140    How many non-eol characters are at the head is returned as *SKIP.  */
4141
4142 #define MAX_EOL_CHECK_COUNT 3
4143
4144 static int
4145 detect_eol_type (source, src_bytes, skip)
4146      unsigned char *source;
4147      int src_bytes, *skip;
4148 {
4149   unsigned char *src = source, *src_end = src + src_bytes;
4150   unsigned char c;
4151   int total = 0;                /* How many end-of-lines are found so far.  */
4152   int eol_type = CODING_EOL_UNDECIDED;
4153   int this_eol_type;
4154
4155   *skip = 0;
4156
4157   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4158     {
4159       c = *src++;
4160       if (c == '\n' || c == '\r')
4161         {
4162           if (*skip == 0)
4163             *skip = src - 1 - source;
4164           total++;
4165           if (c == '\n')
4166             this_eol_type = CODING_EOL_LF;
4167           else if (src >= src_end || *src != '\n')
4168             this_eol_type = CODING_EOL_CR;
4169           else
4170             this_eol_type = CODING_EOL_CRLF, src++;
4171
4172           if (eol_type == CODING_EOL_UNDECIDED)
4173             /* This is the first end-of-line.  */
4174             eol_type = this_eol_type;
4175           else if (eol_type != this_eol_type)
4176             {
4177               /* The found type is different from what found before.  */
4178               eol_type = CODING_EOL_INCONSISTENT;
4179               break;
4180             }
4181         }
4182     }
4183
4184   if (*skip == 0)
4185     *skip = src_end - source;
4186   return eol_type;
4187 }
4188
4189 /* Like detect_eol_type, but detect EOL type in 2-octet
4190    big-endian/little-endian format for coding systems utf-16-be and
4191    utf-16-le.  */
4192
4193 static int
4194 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4195      unsigned char *source;
4196      int src_bytes, *skip, big_endian_p;
4197 {
4198   unsigned char *src = source, *src_end = src + src_bytes;
4199   unsigned int c1, c2;
4200   int total = 0;                /* How many end-of-lines are found so far.  */
4201   int eol_type = CODING_EOL_UNDECIDED;
4202   int this_eol_type;
4203   int msb, lsb;
4204
4205   if (big_endian_p)
4206     msb = 0, lsb = 1;
4207   else
4208     msb = 1, lsb = 0;
4209
4210   *skip = 0;
4211
4212   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4213     {
4214       c1 = (src[msb] << 8) | (src[lsb]);
4215       src += 2;
4216
4217       if (c1 == '\n' || c1 == '\r')
4218         {
4219           if (*skip == 0)
4220             *skip = src - 2 - source;
4221           total++;
4222           if (c1 == '\n')
4223             {
4224               this_eol_type = CODING_EOL_LF;
4225             }
4226           else
4227             {
4228               if ((src + 1) >= src_end)
4229                 {
4230                   this_eol_type = CODING_EOL_CR;
4231                 }
4232               else
4233                 {
4234                   c2 = (src[msb] << 8) | (src[lsb]);
4235                   if (c2 == '\n')
4236                     this_eol_type = CODING_EOL_CRLF, src += 2;
4237                   else
4238                     this_eol_type = CODING_EOL_CR;
4239                 }
4240             }
4241
4242           if (eol_type == CODING_EOL_UNDECIDED)
4243             /* This is the first end-of-line.  */
4244             eol_type = this_eol_type;
4245           else if (eol_type != this_eol_type)
4246             {
4247               /* The found type is different from what found before.  */
4248               eol_type = CODING_EOL_INCONSISTENT;
4249               break;
4250             }
4251         }
4252     }
4253
4254   if (*skip == 0)
4255     *skip = src_end - source;
4256   return eol_type;
4257 }
4258
4259 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4260    is encoded.  If it detects an appropriate format of end-of-line, it
4261    sets the information in *CODING.  */
4262
4263 void
4264 detect_eol (coding, src, src_bytes)
4265      struct coding_system *coding;
4266      unsigned char *src;
4267      int src_bytes;
4268 {
4269   Lisp_Object val;
4270   int skip;
4271   int eol_type;
4272
4273   switch (coding->category_idx)
4274     {
4275     case CODING_CATEGORY_IDX_UTF_16_BE:
4276       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4277       break;
4278     case CODING_CATEGORY_IDX_UTF_16_LE:
4279       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4280       break;
4281     default:
4282       eol_type = detect_eol_type (src, src_bytes, &skip);
4283       break;
4284     }
4285
4286   if (coding->heading_ascii > skip)
4287     coding->heading_ascii = skip;
4288   else
4289     skip = coding->heading_ascii;
4290
4291   if (eol_type == CODING_EOL_UNDECIDED)
4292     return;
4293   if (eol_type == CODING_EOL_INCONSISTENT)
4294     {
4295 #if 0
4296       /* This code is suppressed until we find a better way to
4297          distinguish raw text file and binary file.  */
4298
4299       /* If we have already detected that the coding is raw-text, the
4300          coding should actually be no-conversion.  */
4301       if (coding->type == coding_type_raw_text)
4302         {
4303           setup_coding_system (Qno_conversion, coding);
4304           return;
4305         }
4306       /* Else, let's decode only text code anyway.  */
4307 #endif /* 0 */
4308       eol_type = CODING_EOL_LF;
4309     }
4310
4311   val = Fget (coding->symbol, Qeol_type);
4312   if (VECTORP (val) && XVECTOR (val)->size == 3)
4313     {
4314       int src_multibyte = coding->src_multibyte;
4315       int dst_multibyte = coding->dst_multibyte;
4316
4317       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4318       coding->src_multibyte = src_multibyte;
4319       coding->dst_multibyte = dst_multibyte;
4320       coding->heading_ascii = skip;
4321     }
4322 }
4323
/* Number of bytes added on top of the estimated size of a conversion
   buffer by decoding_buffer_size and encoding_buffer_size.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which a text may grow when it is decoded by CODING (a
   pointer to struct coding_system): 3 for ISO2022, the value stored
   with the decoder for CCL, and 2 otherwise.  The argument is
   parenthesized so that any pointer expression can be passed.  */
#define DECODING_BUFFER_MAG(coding)                       \
  ((coding)->type == coding_type_iso2022                  \
   ? 3                                                    \
   : ((coding)->type == coding_type_ccl                   \
      ? (coding)->spec.ccl.decoder.buf_magnification      \
      : 2))
4332
4333 /* Return maximum size (bytes) of a buffer enough for decoding
4334    SRC_BYTES of text encoded in CODING.  */
4335
4336 int
4337 decoding_buffer_size (coding, src_bytes)
4338      struct coding_system *coding;
4339      int src_bytes;
4340 {
4341   return (src_bytes * DECODING_BUFFER_MAG (coding)
4342           + CONVERSION_BUFFER_EXTRA_ROOM);
4343 }
4344
4345 /* Return maximum size (bytes) of a buffer enough for encoding
4346    SRC_BYTES of text to CODING.  */
4347
4348 int
4349 encoding_buffer_size (coding, src_bytes)
4350      struct coding_system *coding;
4351      int src_bytes;
4352 {
4353   int magnification;
4354
4355   if (coding->type == coding_type_ccl)
4356     magnification = coding->spec.ccl.encoder.buf_magnification;
4357   else if (CODING_REQUIRE_ENCODING (coding))
4358     magnification = 3;
4359   else
4360     magnification = 1;
4361
4362   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4363 }
4364
/* Scratch buffer used while converting text.  It is filled in by
   allocate_conversion_buffer, grown by extend_conversion_buffer and
   released by free_conversion_buffer.  */
struct conversion_buffer
{
  /* Number of bytes allocated at DATA.  */
  int size;
  /* 1 if DATA was obtained from alloca, 0 if from the heap.  */
  int on_stack;
  /* The storage itself.  */
  unsigned char *data;
};
4372
/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  The value is parenthesized so that it
   stays a single operand wherever the macro is used.  */
#define MAX_ALLOCA (16*1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   Requests smaller than MAX_ALLOCA are taken from the stack with
   alloca, so they are only valid inside the calling function; larger
   ones come from xmalloc.  BUF.on_stack records which one was used.
   LEN is evaluated more than once, so it must not have side
   effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4392
4393 /* Grow *BUF to twice its current size, keeping its contents.  A
        buffer flagged on_stack is copied into memory from xmalloc.  */
4394 static void
4395 extend_conversion_buffer (buf)
4396      struct conversion_buffer *buf;
4397 {
4398   int doubled = buf->size * 2;
4399
4400   if (!buf->on_stack)
4401     buf->data = (unsigned char *) xrealloc (buf->data, doubled);
4402   else
4403     {
4404       unsigned char *heap = (unsigned char *) xmalloc (doubled);
4405
4406       bcopy (buf->data, heap, buf->size);
4407       buf->data = heap;
4408       buf->on_stack = 0;
4409     }
4410   buf->size = doubled;
     }
4411
4412 /* Release the memory of BUF with xfree, unless BUF is on stack.  */
4413 static void
4414 free_conversion_buffer (buf)
4415      struct conversion_buffer *buf;
4416 {
4417   if (buf->on_stack) return;    /* Nothing to release here.  */
4418   xfree (buf->data);
4419 }
4420
     /* Run the CCL program of CODING on SRC_BYTES bytes at SOURCE and
        store the result at DESTINATION: the encoder program when ENCODEP
        is nonzero, else the decoder program.  DST_BYTES is handed on to
        ccl_driver, less any bytes already used for carryover data.  Set
        CODING->produced, CODING->produced_char, CODING->consumed and
        CODING->result, and return CODING->result.  */
4421 int
4422 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4423      struct coding_system *coding;
4424      unsigned char *source, *destination;
4425      int src_bytes, dst_bytes, encodep;
4426 {
4427   struct ccl_program *ccl
4428     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4429   unsigned char *dst = destination;
4430
4431   ccl->suppress_error = coding->suppress_error;
4432   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4433   if (encodep)
4434     {
4435       /* On encoding, EOL format is converted within ccl_driver.  For
4436          that, set up proper information in the structure CCL.  */
4437       ccl->eol_type = coding->eol_type;
4438       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4439         ccl->eol_type = CODING_EOL_LF;
4440       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4441     }
4442   ccl->multibyte = coding->src_multibyte;
4443   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4444     {
4445       /* Move carryover bytes to DESTINATION.  The carryover array
             is terminated by a zero byte.  */
4446       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4447       while (*p)
4448         *dst++ = *p++;
4449       coding->spec.ccl.eight_bit_carryover[0] = 0;
           /* A zero DST_BYTES is left as is.  NOTE(review): zero looks
              like "no explicit limit given" -- confirm against the
              callers and ccl_driver.  */
4450       if (dst_bytes)
4451         dst_bytes -= dst - destination;
4452     }
4453
       /* The carryover bytes copied above count as produced, too.  */
4454   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4455                                   &(coding->consumed))
4456                       + dst - destination);
4457
4458   if (encodep)
4459     {
           /* Each produced byte is counted as one character.  */
4460       coding->produced_char = coding->produced;
4461       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4462     }
4463   else if (!ccl->eight_bit_control)
4464     {
4465       /* The produced bytes form a valid multibyte sequence. */
4466       coding->produced_char
4467         = multibyte_chars_in_text (destination, coding->produced);
4468       coding->spec.ccl.eight_bit_carryover[0] = 0;
4469     }
4470   else
4471     {
4472       /* On decoding, the destination should always be multibyte.
4473          But the CCL program might have generated an invalid
4474          multibyte sequence.  Here we make such a sequence valid as
4475          multibyte.  */
4476       int bytes
4477         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4478
4479       if ((coding->consumed < src_bytes
4480            || !ccl->last_block)
4481           && coding->produced >= 1
4482           && destination[coding->produced - 1] >= 0x80)
4483         {
4484           /* We should not convert the trailing 8-bit codes to
4485              multibyte form even if they don't form a valid
4486              multibyte sequence.  They may form a valid sequence in
4487              the next call.  */
4488           int carryover = 0;
4489
               /* Look at up to three trailing bytes.  CARRYOVER becomes
                  the distance from the end to the nearest byte in the
                  range 0x80..0x9F, provided every byte after it is
                  0xA0 or above; otherwise it stays zero.  */
4490           if (destination[coding->produced - 1] < 0xA0)
4491             carryover = 1;
4492           else if (coding->produced >= 2)
4493             {
4494               if (destination[coding->produced - 2] >= 0x80)
4495                 {
4496                   if (destination[coding->produced - 2] < 0xA0)
4497                     carryover = 2;
4498                   else if (coding->produced >= 3
4499                            && destination[coding->produced - 3] >= 0x80
4500                            && destination[coding->produced - 3] < 0xA0)
4501                     carryover = 3;
4502                 }
4503             }
4504           if (carryover > 0)
4505             {
                   /* Save the tail for the next call and drop it from
                      this call's output.  */
4506               BCOPY_SHORT (destination + coding->produced - carryover,
4507                            coding->spec.ccl.eight_bit_carryover,
4508                            carryover);
4509               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4510               coding->produced -= carryover;
4511             }
4512         }
4513       coding->produced = str_as_multibyte (destination, bytes,
4514                                            coding->produced,
4515                                            &(coding->produced_char));
4516     }
4517
       /* Translate the CCL status into a CODING_FINISH_XXX result.  */
4518   switch (ccl->status)
4519     {
4520     case CCL_STAT_SUSPEND_BY_SRC:
4521       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4522       break;
4523     case CCL_STAT_SUSPEND_BY_DST:
4524       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4525       break;
4526     case CCL_STAT_QUIT:
4527     case CCL_STAT_INVALID_CMD:
4528       coding->result = CODING_FINISH_INTERRUPT;
4529       break;
4530     default:
4531       coding->result = CODING_FINISH_NORMAL;
4532       break;
4533     }
4534   return coding->result;
4535 }
4536
4537 /* Decode EOL format of the text at PTR of BYTES length destructively
4538    according to CODING->eol_type.  This is called after the CCL
4539    program produced a decoded text at PTR.  If we do CRLF->LF
4540    conversion, update CODING->produced and CODING->produced_char.
     
        If CODING->eol_type is CODING_EOL_UNDECIDED, it is first detected
        from the text itself, and CODING->symbol is replaced by the
        matching element of the vector stored under Qeol_type.  If an
        inconsistent EOL is met while CODING_MODE_INHIBIT_INCONSISTENT_EOL
        is set, the conversion done so far is undone and CODING falls
        back to CODING_EOL_LF with its original symbol.
     
        BYTES may be zero, in which case the text is not examined.  */
4541
4542 static void
4543 decode_eol_post_ccl (coding, ptr, bytes)
4544      struct coding_system *coding;
4545      unsigned char *ptr;
4546      int bytes;
4547 {
4548   Lisp_Object val, saved_coding_symbol;
4549   unsigned char *pend = ptr + bytes;
4550   int dummy;
4551
4552   /* Remember the current coding system symbol.  We set it back when
4553      an inconsistent EOL is found so that `last-coding-system-used' is
4554      set to the coding system that doesn't specify EOL conversion.  */
4555   saved_coding_symbol = coding->symbol;
4556
4557   coding->spec.ccl.cr_carryover = 0;
4558   if (coding->eol_type == CODING_EOL_UNDECIDED)
4559     {
4560       /* Here, to avoid the call of setup_coding_system, we directly
4561          call detect_eol_type.  */
4562       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4563       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4564         coding->eol_type = CODING_EOL_LF;
4565       if (coding->eol_type != CODING_EOL_UNDECIDED)
4566         {
4567           val = Fget (coding->symbol, Qeol_type);
4568           if (VECTORP (val) && XVECTOR (val)->size == 3)
4569             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4570         }
4571       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4572     }
4573
4574   if (coding->eol_type == CODING_EOL_LF
4575       || coding->eol_type == CODING_EOL_UNDECIDED)
4576     {
4577       /* We have nothing to do.  */
4578       ptr = pend;
4579     }
4580   else if (coding->eol_type == CODING_EOL_CRLF)
4581     {
           /* PTR reads the text and P writes the converted text back
              into the same memory; P never gets ahead of PTR.  */
4582       unsigned char *pstart = ptr, *p = ptr;
4583
           /* The PTR < PEND test keeps us from reading the byte in
              front of PTR when no text was produced at all.  */
4584       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4585           && ptr < pend && *(pend - 1) == '\r')
4586         {
4587           /* If the last character is CR, we can't handle it here
4588              because LF will be in the not-yet-decoded source text.
4589              Record that the CR is not yet processed.  */
4590           coding->spec.ccl.cr_carryover = 1;
4591           coding->produced--;
4592           coding->produced_char--;
4593           pend--;
4594         }
4595       while (ptr < pend)
4596         {
4597           if (*ptr == '\r')
4598             {
4599               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4600                 {
4601                   *p++ = '\n';
4602                   ptr += 2;
4603                 }
4604               else
4605                 {
                       /* A CR that is not followed by LF.  */
4606                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4607                     goto undo_eol_conversion;
4608                   *p++ = *ptr++;
4609                 }
4610             }
               /* An LF that is not preceded by CR.  */
4611           else if (*ptr == '\n'
4612                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4613             goto undo_eol_conversion;
4614           else
4615             *p++ = *ptr++;
4616           continue;
4617
4618         undo_eol_conversion:
4619           /* We have been faced with an inconsistent EOL format at
4620              PTR.  Convert all LFs before PTR back to CRLFs.  */
4621           for (p--, ptr--; p >= pstart; p--)
4622             {
4623               if (*p == '\n')
4624                 *ptr-- = '\n', *ptr-- = '\r';
4625               else
4626                 *ptr-- = *p;
4627             }
4628           /*  If carryover is recorded, cancel it because we don't
4629               convert CRLF anymore.  */
4630           if (coding->spec.ccl.cr_carryover)
4631             {
4632               coding->spec.ccl.cr_carryover = 0;
4633               coding->produced++;
4634               coding->produced_char++;
4635               pend++;
4636             }
               /* Setting both pointers to PEND ends the loop and makes
                  the length adjustment below a no-op.  */
4637           p = ptr = pend;
4638           coding->eol_type = CODING_EOL_LF;
4639           coding->symbol = saved_coding_symbol;
4640         }
4641       if (p < pend)
4642         {
4643           /* As each two-byte sequence CRLF was converted to LF, (PEND
4644              - P) is the number of deleted characters.  */
4645           coding->produced -= pend - p;
4646           coding->produced_char -= pend - p;
4647         }
4648     }
4649   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4650     {
           /* P stays at the start of the text so that the CR->LF
              changes made so far can be reverted.  */
4651       unsigned char *p = ptr;
4652
4653       for (; ptr < pend; ptr++)
4654         {
4655           if (*ptr == '\r')
4656             *ptr = '\n';
4657           else if (*ptr == '\n'
4658                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4659             {
                   /* Inconsistent EOL: turn the LFs before PTR back
                      into CRs and give up the conversion.  */
4660               for (; p < ptr; p++)
4661                 {
4662                   if (*p == '\n')
4663                     *p = '\r';
4664                 }
4665               ptr = pend;
4666               coding->eol_type = CODING_EOL_LF;
4667               coding->symbol = saved_coding_symbol;
4668             }
4669         }
4670     }
4671 }
4672
4673 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4674    decoding, it may detect coding system and format of end-of-line if
4675    those are not yet decided.  The source should be unibyte, the
4676    result is multibyte if CODING->dst_multibyte is nonzero, else
4677    unibyte.
     
        The counters CODING->produced, produced_char, consumed,
        consumed_char and errors are reset on entry.  The value returned
        is CODING->result.  */
4678
4679 int
4680 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4681      struct coding_system *coding;
4682      unsigned char *source, *destination;
4683      int src_bytes, dst_bytes;
4684 {
4685   if (coding->type == coding_type_undecided)
4686     detect_coding (coding, source, src_bytes);
4687
       /* For a CCL coding system, EOL detection is left to
          decode_eol_post_ccl, which runs on the decoded text.  */
4688   if (coding->eol_type == CODING_EOL_UNDECIDED
4689       && coding->type != coding_type_ccl)
4690     {
4691       detect_eol (coding, source, src_bytes);
4692       /* We had better recover the original eol format if we
4693          encounter an inconsistent eol format while decoding.  */
4694       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4695     }
4696
4697   coding->produced = coding->produced_char = 0;
4698   coding->consumed = coding->consumed_char = 0;
4699   coding->errors = 0;
4700   coding->result = CODING_FINISH_NORMAL;
4701
       /* Dispatch to the decoder for the type of CODING.  The last
          argument of decode_coding_sjis_big5 is 1 for SJIS and 0 for
          BIG5.  */
4702   switch (coding->type)
4703     {
4704     case coding_type_sjis:
4705       decode_coding_sjis_big5 (coding, source, destination,
4706                                src_bytes, dst_bytes, 1);
4707       break;
4708
4709     case coding_type_iso2022:
4710       decode_coding_iso2022 (coding, source, destination,
4711                              src_bytes, dst_bytes);
4712       break;
4713
4714     case coding_type_big5:
4715       decode_coding_sjis_big5 (coding, source, destination,
4716                                src_bytes, dst_bytes, 0);
4717       break;
4718
4719     case coding_type_emacs_mule:
4720       decode_coding_emacs_mule (coding, source, destination,
4721                                 src_bytes, dst_bytes);
4722       break;
4723
4724     case coding_type_ccl:
4725       if (coding->spec.ccl.cr_carryover)
4726         {
4727           /* Set the CR which is not processed by the previous call of
4728              decode_eol_post_ccl in DESTINATION.  */
4729           *destination = '\r';
4730           coding->produced++;
4731           coding->produced_char++;
4732           dst_bytes--;
4733         }
           /* The CCL output goes after the carried-over CR, if any.  */
4734       ccl_coding_driver (coding, source,
4735                          destination + coding->spec.ccl.cr_carryover,
4736                          src_bytes, dst_bytes, 0);
4737       if (coding->eol_type != CODING_EOL_LF)
4738         decode_eol_post_ccl (coding, destination, coding->produced);
4739       break;
4740
4741     default:
           /* The remaining types need only EOL conversion.  */
4742       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4743     }
4744
       /* Running out of source on the last block after consuming all
          of it is a normal finish.  */
4745   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4746       && coding->mode & CODING_MODE_LAST_BLOCK
4747       && coding->consumed == src_bytes)
4748     coding->result = CODING_FINISH_NORMAL;
4749
4750   if (coding->mode & CODING_MODE_LAST_BLOCK
4751       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4752     {
           /* This is the last block, but some source bytes could not
              be decoded.  Count that as an error, close a pending
              composition, and store each leftover byte in DESTINATION
              by CHAR_STRING as one character.  */
4753       unsigned char *src = source + coding->consumed;
4754       unsigned char *dst = destination + coding->produced;
4755
4756       src_bytes -= coding->consumed;
4757       coding->errors++;
4758       if (COMPOSING_P (coding))
4759         DECODE_COMPOSITION_END ('1');
4760       while (src_bytes--)
4761         {
4762           int c = *src++;
4763           dst += CHAR_STRING (c, dst);
4764           coding->produced_char++;
4765         }
4766       coding->consumed = coding->consumed_char = src - source;
4767       coding->produced = dst - destination;
4768       coding->result = CODING_FINISH_NORMAL;
4769     }
4770
       /* A unibyte result is wanted: convert DESTINATION in place, and
          count one character per byte.  */
4771   if (!coding->dst_multibyte)
4772     {
4773       coding->produced = str_as_unibyte (destination, coding->produced);
4774       coding->produced_char = coding->produced;
4775     }
4776
4777   return coding->result;
4778 }
4779
4780 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4781    multibyteness of the source is CODING->src_multibyte, the
4782    multibyteness of the result is always unibyte.
     
        The counters CODING->produced, produced_char, consumed,
        consumed_char and errors are reset on entry.  The value returned
        is CODING->result.  */
4783
4784 int
4785 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4786      struct coding_system *coding;
4787      unsigned char *source, *destination;
4788      int src_bytes, dst_bytes;
4789 {
4790   coding->produced = coding->produced_char = 0;
4791   coding->consumed = coding->consumed_char = 0;
4792   coding->errors = 0;
4793   coding->result = CODING_FINISH_NORMAL;
4794
       /* Dispatch to the encoder for the type of CODING.  The last
          argument of encode_coding_sjis_big5 is 1 for SJIS and 0 for
          BIG5; the last argument of ccl_coding_driver selects the
          encoder program.  */
4795   switch (coding->type)
4796     {
4797     case coding_type_sjis:
4798       encode_coding_sjis_big5 (coding, source, destination,
4799                                src_bytes, dst_bytes, 1);
4800       break;
4801
4802     case coding_type_iso2022:
4803       encode_coding_iso2022 (coding, source, destination,
4804                              src_bytes, dst_bytes);
4805       break;
4806
4807     case coding_type_big5:
4808       encode_coding_sjis_big5 (coding, source, destination,
4809                                src_bytes, dst_bytes, 0);
4810       break;
4811
4812     case coding_type_emacs_mule:
4813       encode_coding_emacs_mule (coding, source, destination,
4814                                 src_bytes, dst_bytes);
4815       break;
4816
4817     case coding_type_ccl:
4818       ccl_coding_driver (coding, source, destination,
4819                          src_bytes, dst_bytes, 1);
4820       break;
4821
4822     default:
           /* The remaining types need only EOL conversion.  */
4823       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4824     }
4825
4826   if (coding->mode & CODING_MODE_LAST_BLOCK
4827       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4828     {
           /* This is the last block, but the encoder stopped for lack
              of source.  Finish the output here.  */
4829       unsigned char *src = source + coding->consumed;
4830       unsigned char *dst = destination + coding->produced;
4831
           /* NOTE(review): ENCODE_RESET_PLANE_AND_REGISTER is a macro
              defined elsewhere; it presumably writes through DST --
              confirm at its definition.  */
4832       if (coding->type == coding_type_iso2022)
4833         ENCODE_RESET_PLANE_AND_REGISTER;
           /* Close a pending composition with the sequence ESC 1.  */
4834       if (COMPOSING_P (coding))
4835         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4836       if (coding->consumed < src_bytes)
4837         {
               /* Copy the source bytes not yet consumed as they are,
                  converted to unibyte form if the source is
                  multibyte.  */
4838           int len = src_bytes - coding->consumed;
4839
4840           BCOPY_SHORT (src, dst, len);
4841           if (coding->src_multibyte)
4842             len = str_as_unibyte (dst, len);
4843           dst += len;
4844           coding->consumed = src_bytes;
4845         }
4846       coding->produced = coding->produced_char = dst - destination;
4847       coding->result = CODING_FINISH_NORMAL;
4848     }
4849
       /* Running out of source after consuming all of it is a normal
          finish, whether or not this is the last block.  */
4850   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4851       && coding->consumed == src_bytes)
4852     coding->result = CODING_FINISH_NORMAL;
4853
4854   return coding->result;
4855 }
4856
4857 /* Scan text in the region between *BEG and *END (byte positions),
4858    skip characters which we don't have to decode by coding system
4859    CODING at the head and tail, then set *BEG and *END to the region
4860    of the text we actually have to convert.  The caller should move
4861    the gap out of the region in advance if the region is from a
4862    buffer.
4863
4864    If STR is not NULL, *BEG and *END are indices into STR.
     
        *BEG and *END are left untouched when nothing can be skipped:
        for CCL and undecided coding systems, for any EOL type other
        than CODING_EOL_LF, when a post-read-conversion is set, when
        composition handling is not disabled, when the coding system
        needs no conversion at all, and when the translation table in
        effect maps some ASCII character.  */
4865
4866 static void
4867 shrink_decoding_region (beg, end, coding, str)
4868      int *beg, *end;
4869      struct coding_system *coding;
4870      unsigned char *str;
4871 {
4872   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4873   int eol_conversion;
4874   Lisp_Object translation_table;
4875
4876   if (coding->type == coding_type_ccl
4877       || coding->type == coding_type_undecided
4878       || coding->eol_type != CODING_EOL_LF
4879       || !NILP (coding->post_read_conversion)
4880       || coding->composing != COMPOSITION_DISABLED)
4881     {
4882       /* We can't skip any data.  */
4883       return;
4884     }
4885   if (coding->type == coding_type_no_conversion
4886       || coding->type == coding_type_raw_text
4887       || coding->type == coding_type_emacs_mule)
4888     {
4889       /* We need no conversion, but don't have to skip any data here.
4890          Decoding routine handles them effectively anyway.  */
4891       return;
4892     }
4893
4894   translation_table = coding->translation_table_for_decode;
4895   if (NILP (translation_table) && !NILP (Venable_character_translation))
4896     translation_table = Vstandard_translation_table_for_decode;
4897   if (CHAR_TABLE_P (translation_table))
4898     {
4899       int i;
4900       for (i = 0; i < 128; i++)
4901         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4902           break;
4903       if (i < 128)
4904         /* Some ASCII character should be translated.  We give up
4905            shrinking.  */
4906         return;
4907     }
4908
4909   if (coding->heading_ascii >= 0)
4910     /* Detection routine has already found how much we can skip at the
4911        head.  */
4912     *beg += coding->heading_ascii;
4913
       /* BEGP and ENDP walk over the text itself; the distance each
          has moved is added to *BEG and *END at the end.  */
4914   if (str)
4915     {
4916       begp_orig = begp = str + *beg;
4917       endp_orig = endp = str + *end;
4918     }
4919   else
4920     {
4921       begp_orig = begp = BYTE_POS_ADDR (*beg);
4922       endp_orig = endp = begp + *end - *beg;
4923     }
4924
       /* As we have already returned unless CODING->eol_type is
          CODING_EOL_LF, this is always zero here, and the branches
          below that depend on EOL_CONVERSION are never taken.  */
4925   eol_conversion = (coding->eol_type == CODING_EOL_CR
4926                     || coding->eol_type == CODING_EOL_CRLF);
4927
4928   switch (coding->type)
4929     {
4930     case coding_type_sjis:
4931     case coding_type_big5:
4932       /* We can skip all ASCII characters at the head.  */
4933       if (coding->heading_ascii < 0)
4934         {
4935           if (eol_conversion)
4936             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4937           else
4938             while (begp < endp && *begp < 0x80) begp++;
4939         }
4940       /* We can skip all ASCII characters at the tail except for the
4941          second byte of SJIS or BIG5 code.  */
4942       if (eol_conversion)
4943         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4944       else
4945         while (begp < endp && endp[-1] < 0x80) endp--;
4946       /* Do not consider LF as ascii if preceded by CR, since that
4947          confuses eol decoding. */
4948       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4949         endp++;
           /* The byte after a trailing 8-bit byte may be the second
              byte of a two-byte code; keep it in the region.  */
4950       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4951         endp++;
4952       break;
4953
4954     case coding_type_iso2022:
4955       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4956         /* We can't skip any data.  */
4957         break;
4958       if (coding->heading_ascii < 0)
4959         {
4960           /* We can skip all ASCII characters at the head except for a
4961              few control codes.  */
4962           while (begp < endp && (c = *begp) < 0x80
4963                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4964                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4965                  && (!eol_conversion || c != ISO_CODE_LF))
4966             begp++;
4967         }
4968       switch (coding->category_idx)
4969         {
4970         case CODING_CATEGORY_IDX_ISO_8_1:
4971         case CODING_CATEGORY_IDX_ISO_8_2:
4972           /* We can skip all ASCII characters at the tail.  */
4973           if (eol_conversion)
4974             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4975           else
4976             while (begp < endp && endp[-1] < 0x80) endp--;
4977           /* Do not consider LF as ascii if preceded by CR, since that
4978              confuses eol decoding. */
4979           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4980             endp++;
4981           break;
4982
4983         case CODING_CATEGORY_IDX_ISO_7:
4984         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4985           {
4986             /* We can skip all characters at the tail except for 8-bit
4987                codes and ESC and the following 2-byte at the tail.
                    EIGHT_BIT is set to the position just after the last
                    8-bit code of the text, if one is met while scanning
                    backward.  */
4988             unsigned char *eight_bit = NULL;
4989
4990             if (eol_conversion)
4991               while (begp < endp
4992                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4993                 {
4994                   if (!eight_bit && c & 0x80) eight_bit = endp;
4995                   endp--;
4996                 }
4997             else
4998               while (begp < endp
4999                      && (c = endp[-1]) != ISO_CODE_ESC)
5000                 {
5001                   if (!eight_bit && c & 0x80) eight_bit = endp;
5002                   endp--;
5003                 }
5004             /* Do not consider LF as ascii if preceded by CR, since that
5005                confuses eol decoding. */
5006             if (begp < endp && endp < endp_orig
5007                 && endp[-1] == '\r' && endp[0] == '\n')
5008               endp++;
5009             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5010               {
                     /* ENDP points just after the ESC.  Examine the two
                        bytes of text that follow it; they must be read
                        through ENDP, not through the index pointer
                        END.  */
5011                 if (endp + 1 < endp_orig && endp[0] == '(' && endp[1] == 'B')
5012                   /* This is an ASCII designation sequence.  We can
5013                      surely skip the tail.  But, if we have
5014                      encountered an 8-bit code, skip only the codes
5015                      after that.  */
5016                   endp = eight_bit ? eight_bit : endp + 2;
5017                 else
5018                   /* Hmmm, we can't skip the tail.  */
5019                   endp = endp_orig;
5020               }
5021             else if (eight_bit)
5022               endp = eight_bit;
5023           }
5024         }
5025       break;
5026
5027     default:
           /* All the other types have made us return above.  */
5028       abort ();
5029     }
5030   *beg += begp - begp_orig;
5031   *end += endp - endp_orig;
5032   return;
5033 }
5034
5035 /* Like shrink_decoding_region but for encoding.
     
        *BEG and *END (indices into STR if STR is not NULL, else byte
        positions in the current buffer) are narrowed to the part of the
        text that really has to be encoded by CODING.  They are left
        untouched for CCL coding systems, for the EOL types CRLF and CR,
        when composition data is recorded in CODING->cmp_data, when the
        coding system needs no conversion, and when the translation
        table in effect maps some ASCII character.  */
5036
5037 static void
5038 shrink_encoding_region (beg, end, coding, str)
5039      int *beg, *end;
5040      struct coding_system *coding;
5041      unsigned char *str;
5042 {
5043   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5044   int eol_conversion;
5045   Lisp_Object translation_table;
5046
5047   if (coding->type == coding_type_ccl
5048       || coding->eol_type == CODING_EOL_CRLF
5049       || coding->eol_type == CODING_EOL_CR
5050       || (coding->cmp_data && coding->cmp_data->used > 0))
5051     {
5052       /* We can't skip any data.  */
5053       return;
5054     }
5055   if (coding->type == coding_type_no_conversion
5056       || coding->type == coding_type_raw_text
5057       || coding->type == coding_type_emacs_mule
5058       || coding->type == coding_type_undecided)
5059     {
5060       /* We need no conversion, but don't have to skip any data here.
5061          Encoding routine handles them effectively anyway.  */
5062       return;
5063     }
5064
5065   translation_table = coding->translation_table_for_encode;
5066   if (NILP (translation_table) && !NILP (Venable_character_translation))
5067     translation_table = Vstandard_translation_table_for_encode;
5068   if (CHAR_TABLE_P (translation_table))
5069     {
5070       int i;
5071       for (i = 0; i < 128; i++)
5072         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5073           break;
5074       if (i < 128)
5075         /* Some ASCII character should be translated.  We give up
5076            shrinking.  */
5077         return;
5078     }
5079
       /* BEGP and ENDP walk over the text itself; the distance each
          has moved is added to *BEG and *END at the end.  */
5080   if (str)
5081     {
5082       begp_orig = begp = str + *beg;
5083       endp_orig = endp = str + *end;
5084     }
5085   else
5086     {
5087       begp_orig = begp = BYTE_POS_ADDR (*beg);
5088       endp_orig = endp = begp + *end - *beg;
5089     }
5090
       /* As we have already returned for the EOL types CR and CRLF,
          this is always zero here, and the branches below that depend
          on EOL_CONVERSION are never taken.  */
5091   eol_conversion = (coding->eol_type == CODING_EOL_CR
5092                     || coding->eol_type == CODING_EOL_CRLF);
5093
5094   /* Here, we don't have to check coding->pre_write_conversion because
5095      the caller is expected to have handled it already.  */
5096   switch (coding->type)
5097     {
5098     case coding_type_iso2022:
5099       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5100         /* We can't skip any data.  */
5101         break;
5102       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5103         {
               /* Skip the ASCII characters at the head, but only up to
                  the beginning of the last line reached, so that the
                  region to encode starts at a beginning of line.  */
5104           unsigned char *bol = begp;
5105           while (begp < endp && *begp < 0x80)
5106             {
5107               begp++;
5108               if (begp[-1] == '\n')
5109                 bol = begp;
5110             }
5111           begp = bol;
5112           goto label_skip_tail;
5113         }
5114       /* fall down ... */
5115
5116     case coding_type_sjis:
5117     case coding_type_big5:
5118       /* We can skip all ASCII characters at the head and tail.  */
5119       if (eol_conversion)
5120         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5121       else
5122         while (begp < endp && *begp < 0x80) begp++;
5123     label_skip_tail:
5124       if (eol_conversion)
5125         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5126       else
5127         while (begp < endp && *(endp - 1) < 0x80) endp--;
5128       break;
5129
5130     default:
           /* All the other types have made us return above.  */
5131       abort ();
5132     }
5133
5134   *beg += begp - begp_orig;
5135   *end += endp - endp_orig;
5136   return;
5137 }
5138
5139 /* Shrinking a conversion region has some overhead, so a region is
5140    converted whole, with no attempt to shrink it, unless its length
5141    exceeds this value.  */
5142 static int shrink_conversion_region_threshhold = 1024;
5143
     /* Shrink the region *BEG..*END if it is longer than the threshold
        above, for encoding if ENCODEP is nonzero, else for decoding.  */
5144 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
5145   do {                                                                  \
5146     if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
5147       {                                                                 \
5148         if (!(encodep)) shrink_decoding_region (beg, end, coding, str); \
5149         else shrink_encoding_region (beg, end, coding, str);            \
5150       }                                                                 \
5151   } while (0)
5152
     /* Reset inhibit_pre_post_conversion to zero and return Qnil.  The
        argument DUMMY is ignored.  NOTE(review): judging from the name,
        this is meant to be registered as an unwind handler by the
        region conversion code -- confirm at the call sites.  */
5153 static Lisp_Object
5154 code_convert_region_unwind (dummy)
5155      Lisp_Object dummy;
5156 {
5157   inhibit_pre_post_conversion = 0;
5158   return Qnil;
5159 }
5160
5161 /* Store information about all compositions in the range FROM and TO
5162    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5163    buffer or a string, defaults to the current buffer.
     
        Nothing is done if composition handling is disabled in CODING.
        Positions are recorded relative to FROM.  On return after
        recording, CODING->cmp_data points to the first memory block and
        CODING->cmp_data_start is zero.  */
5164
5165 void
5166 coding_save_composition (coding, from, to, obj)
5167      struct coding_system *coding;
5168      int from, to;
5169      Lisp_Object obj;
5170 {
5171   Lisp_Object prop;
5172   int start, end;
5173
5174   if (coding->composing == COMPOSITION_DISABLED)
5175     return;
5176   if (!coding->cmp_data)
5177     coding_allocate_composition_data (coding, from);
       /* Give up if no composition is found, or if the one found
          extends beyond TO.  */
5178   if (!find_composition (from, to, &start, &end, &prop, obj)
5179       || end > to)
5180     return;
       /* If the composition found begins before FROM, look for another
          one from its end, and give up under the same conditions.  */
5181   if (start < from
5182       && (!find_composition (end, to, &start, &end, &prop, obj)
5183           || end > to))
5184     return;
5185   coding->composing = COMPOSITION_NO;
5186   do
5187     {
5188       if (COMPOSITION_VALID_P (start, end, prop))
5189         {
5190           enum composition_method method = COMPOSITION_METHOD (prop);
               /* Get a new memory block if the current one may not have
                  room for one more bunch of composition data.  */
5191           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5192               >= COMPOSITION_DATA_SIZE)
5193             coding_allocate_composition_data (coding, from);
5194           /* For relative composition, we remember start and end
5195              positions, for the other compositions, we also remember
5196              components.  */
5197           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5198           if (method != COMPOSITION_RELATIVE)
5199             {
5200               /* We must store all the components, which are given
                      as a list, a vector, a string, or one integer.  */
5201               Lisp_Object val, ch;
5202
5203               val = COMPOSITION_COMPONENTS (prop);
5204               if (CONSP (val))
5205                 while (CONSP (val))
5206                   {
5207                     ch = XCAR (val), val = XCDR (val);
5208                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5209                   }
5210               else if (VECTORP (val) || STRINGP (val))
5211                 {
5212                   int len = (VECTORP (val)
5213                              ? XVECTOR (val)->size : XSTRING (val)->size);
5214                   int i;
5215                   for (i = 0; i < len; i++)
5216                     {
5217                       ch = (STRINGP (val)
5218                             ? Faref (val, make_number (i))
5219                             : XVECTOR (val)->contents[i]);
5220                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5221                     }
5222                 }
5223               else              /* INTEGERP (val) */
5224                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5225             }
5226           CODING_ADD_COMPOSITION_END (coding, end - from);
5227         }
           /* Continue the search from the end of this composition.  */
5228       start = end;
5229     }
5230   while (start < to
5231          && find_composition (start, to, &start, &end, &prop, obj)
5232          && end <= to);
5233
5234   /* Make coding->cmp_data point to the first memory block.  */
5235   while (coding->cmp_data->prev)
5236     coding->cmp_data = coding->cmp_data->prev;
5237   coding->cmp_data_start = 0;
5238 }
5239
5240 /* Reflect the saved information about compositions to OBJ.
5241    CODING->cmp_data points to a memory block for the information.  OBJ
5242    is a buffer or a string, defaults to the current buffer.

        Nothing is done when CODING->cmp_data is null.  Otherwise every
        block of the chain is visited, starting from the first one (found
        by following the `prev' links) and advancing through `next'.

        Within a block the records are laid out back to back in `data'.
        As read by this function, a record is:
          data[0]        length of the record, used as the stride
          data[1], [2]   the two positions handed to compose_text
          data[3]        the composition method
          data[4] ...    the components (data[0] - 4 of them); not read
                         for COMPOSITION_RELATIVE
        Scanning of a block stops at `used', or at the first record whose
        length field is not positive.  */
5243
5244 void
5245 coding_restore_composition (coding, obj)
5246      struct coding_system *coding;
5247      Lisp_Object obj;
5248 {
5249   struct composition_data *cmp_data = coding->cmp_data;
5250
       /* No composition data was ever allocated.  */
5251   if (!cmp_data)
5252     return;
5253
       /* Rewind to the first block of the chain.  */
5254   while (cmp_data->prev)
5255     cmp_data = cmp_data->prev;
5256
5257   while (cmp_data)
5258     {
5259       int i;
5260
5261       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5262            i += cmp_data->data[i])
5263         {
5264           int *data = cmp_data->data + i;
5265           enum composition_method method = (enum composition_method) data[3];
5266           Lisp_Object components;
5267
5268           if (method == COMPOSITION_RELATIVE)
5269             components = Qnil;
5270           else
5271             {
                   /* Rebuild the components as a Lisp string for
                      COMPOSITION_WITH_ALTCHARS and as a Lisp vector for
                      the remaining methods.
                      NOTE(review): LEN is not checked against the size of
                      ARGS here; presumably the code that saved the record
                      guarantees it fits -- confirm.  */
5272               int len = data[0] - 4, j;
5273               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5274
5275               for (j = 0; j < len; j++)
5276                 args[j] = make_number (data[4 + j]);
5277               components = (method == COMPOSITION_WITH_ALTCHARS
5278                             ? Fstring (len, args) : Fvector (len, args));
5279             }
5280           compose_text (data[1], data[2], components, Qnil, obj);
5281         }
5282       cmp_data = cmp_data->next;
5283     }
5284 }
5285
5286 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5287    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5288    coding system CODING, and return the status code of code conversion
5289    (currently, this value has no meaning: every return path yields 0).
5290
5291    How many characters (and bytes) were converted into how many
5292    characters (and bytes) is recorded in members of the structure
5293    CODING.
5294
5295    If REPLACE is nonzero, we do various things as if the original text
5296    is deleted and a new text is inserted.  See the comments in
5297    replace_range (insdel.c) to know what we are doing.
5298
5299    If REPLACE is zero, it is assumed that the source text is unibyte.
5300    Otherwise, it is assumed that the source text is multibyte.

        The conversion is done in place in the current buffer: the source
        text is temporarily accounted as part of the gap and the
        converted text is written at the front of the gap (see the
        diagrams in the main loop).  */
5301
5302 int
5303 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5304      int from, from_byte, to, to_byte, encodep, replace;
5305      struct coding_system *coding;
5306 {
5307   int len = to - from, len_byte = to_byte - from_byte;
5308   int nchars_del = 0, nbytes_del = 0;
5309   int require, inserted, inserted_byte;
       /* HEAD_SKIP and TAIL_SKIP are assigned only when the region is
          shrunk below; TOTAL_SKIP stays 0 otherwise and guards their
          later use.  */
5310   int head_skip, tail_skip, total_skip = 0;
5311   Lisp_Object saved_coding_symbol;
5312   int first = 1;
5313   unsigned char *src, *dst;
5314   Lisp_Object deletion;
5315   int orig_point = PT, orig_len = len;
5316   int prev_Z;
5317   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5318
5319   deletion = Qnil;
5320   saved_coding_symbol = coding->symbol;
5321
       /* If point is strictly inside the region, move it to FROM while
          the text is being converted.  */
5322   if (from < PT && PT < to)
5323     {
5324       TEMP_SET_PT_BOTH (from, from_byte);
5325       orig_point = from;
5326     }
5327
5328   if (replace)
5329     {
5330       int saved_from = from;
5331       int saved_inhibit_modification_hooks;
5332
           /* FROM is passed by address; if it comes back changed, the
              other positions are recomputed from it.  */
5333       prepare_to_modify_buffer (from, to, &from);
5334       if (saved_from != from)
5335         {
5336           to = from + len;
5337           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5338           len_byte = to_byte - from_byte;
5339         }
5340
5341       /* The code conversion routine can not preserve text properties
5342          for now.  So, we must remove all text properties in the
5343          region.  Here, we must suppress all modification hooks.  */
5344       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5345       inhibit_modification_hooks = 1;
5346       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5347       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5348     }
5349
5350   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5351     {
5352       /* We must detect encoding of text and eol format.  */
5353
5354       if (from < GPT && to > GPT)
5355         move_gap_both (from, from_byte);
5356       if (coding->type == coding_type_undecided)
5357         {
5358           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5359           if (coding->type == coding_type_undecided)
5360             {
5361               /* It seems that the text contains only ASCII, but we
5362                  should not leave it undecided because the deeper
5363                  decoding routine (decode_coding) tries to detect the
5364                  encodings again in vain.  */
5365               coding->type = coding_type_emacs_mule;
5366               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5367               /* As emacs-mule decoder will handle composition, we
5368                  need this setting to allocate coding->cmp_data
5369                  later.  */
5370               coding->composing = COMPOSITION_NO;
5371             }
5372         }
5373       if (coding->eol_type == CODING_EOL_UNDECIDED
5374           && coding->type != coding_type_ccl)
5375         {
5376           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5377           if (coding->eol_type == CODING_EOL_UNDECIDED)
5378             coding->eol_type = CODING_EOL_LF;
5379           /* We had better recover the original eol format if we
5380              encounter an inconsistent eol format while decoding.  */
5381           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5382         }
5383     }
5384
5385   /* Now we convert the text.  */
5386
5387   /* For encoding, we must process pre-write-conversion in advance.  */
5388   if (! inhibit_pre_post_conversion
5389       && encodep
5390       && SYMBOLP (coding->pre_write_conversion)
5391       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5392     {
5393       /* The function in pre-write-conversion may put a new text in a
5394          new buffer.  */
5395       struct buffer *prev = current_buffer;
5396       Lisp_Object new;
5397
           /* NOTE(review): code_convert_region_unwind is defined
              elsewhere; presumably it resets inhibit_pre_post_conversion
              if the call below exits non-locally -- confirm.  */
5398       record_unwind_protect (code_convert_region_unwind, Qnil);
5399       /* We should not call any more pre-write/post-read-conversion
5400          functions while this pre-write-conversion is running.  */
5401       inhibit_pre_post_conversion = 1;
5402       call2 (coding->pre_write_conversion,
5403              make_number (from), make_number (to));
5404       inhibit_pre_post_conversion = 0;
5405       /* Discard the unwind protect.  */
5406       specpdl_ptr--;
5407
           /* The function left us in another buffer: replace the region
              in the original buffer with the first LEN (= ZV - BEGV of
              the new buffer) characters of that buffer, kill it, and
              recompute all the positions.  */
5408       if (current_buffer != prev)
5409         {
5410           len = ZV - BEGV;
5411           new = Fcurrent_buffer ();
5412           set_buffer_internal_1 (prev);
5413           del_range_2 (from, from_byte, to, to_byte, 0);
5414           TEMP_SET_PT_BOTH (from, from_byte);
5415           insert_from_buffer (XBUFFER (new), 1, len, 0);
5416           Fkill_buffer (new);
5417           if (orig_point >= to)
5418             orig_point += len - orig_len;
5419           else if (orig_point > from)
5420             orig_point = from;
5421           orig_len = len;
5422           to = from + len;
5423           from_byte = CHAR_TO_BYTE (from);
5424           to_byte = CHAR_TO_BYTE (to);
5425           len_byte = to_byte - from_byte;
5426           TEMP_SET_PT_BOTH (from, from_byte);
5427         }
5428     }
5429
       /* Keep what is handed to the adjust_after_replace* call at the
          end: the old text itself when the undo list is not t, otherwise
          only its size.  */
5430   if (replace)
5431     {
5432       if (! EQ (current_buffer->undo_list, Qt))
5433         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5434       else
5435         {
5436           nchars_del = to - from;
5437           nbytes_del = to_byte - from_byte;
5438         }
5439     }
5440
       /* For encoding, save the compositions found in the region; for
          decoding, only allocate the composition data.  */
5441   if (coding->composing != COMPOSITION_DISABLED)
5442     {
5443       if (encodep)
5444         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5445       else
5446         coding_allocate_composition_data (coding, from);
5447     }
5448
5449   /* Try to skip the leading and trailing ASCII characters.  */
5450   if (coding->type != coding_type_ccl)
5451     {
5452       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5453
5454       if (from < GPT && GPT < to)
5455         move_gap_both (from, from_byte);
5456       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* Nothing is left to convert, and there is no post-read
              conversion or flushing to do: the text stays as it is.  */
5457       if (from_byte == to_byte
5458           && (encodep || NILP (coding->post_read_conversion))
5459           && ! CODING_REQUIRE_FLUSHING (coding))
5460         {
5461           coding->produced = len_byte;
5462           coding->produced_char = len;
5463           if (!replace)
5464             /* We must record and adjust for this new text now.  */
5465             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5466           return 0;
5467         }
5468
           /* The skipped byte counts are also applied to the character
              positions below; this relies on the skipped text being
              ASCII, i.e. one byte per character.  */
5469       head_skip = from_byte - from_byte_orig;
5470       tail_skip = to_byte_orig - to_byte;
5471       total_skip = head_skip + tail_skip;
5472       from += head_skip;
5473       to -= tail_skip;
5474       len -= total_skip; len_byte -= total_skip;
5475     }
5476
5477   /* For conversion, we must put the gap before the text in addition to
5478      making the gap larger for efficient decoding.  The required gap
5479      size starts from 2000 which is the magic number used in make_gap.
5480      But, after one batch of conversion, it will be incremented if we
5481      find that it is not enough.  */
5482   require = 2000;
5483
5484   if (GAP_SIZE  < require)
5485     make_gap (require - GAP_SIZE);
5486   move_gap_both (from, from_byte);
5487
5488   inserted = inserted_byte = 0;
5489
       /* Account the LEN characters (LEN_BYTE bytes) of source text as
          part of the gap; the converted text is then written at the front
          of the gap (see the diagrams in the loop below).  */
5490   GAP_SIZE += len_byte;
5491   ZV -= len;
5492   Z -= len;
5493   ZV_BYTE -= len_byte;
5494   Z_BYTE -= len_byte;
5495
5496   if (GPT - BEG < BEG_UNCHANGED)
5497     BEG_UNCHANGED = GPT - BEG;
5498   if (Z - GPT < END_UNCHANGED)
5499     END_UNCHANGED = Z - GPT;
5500
5501   if (!encodep && coding->src_multibyte)
5502     {
5503       /* Decoding routines expect that the source text is unibyte.
5504          We must convert 8-bit characters of multibyte form to
5505          unibyte.  */
5506       int len_byte_orig = len_byte;
5507       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* The text got shorter: move it so that it again ends at the
              end of the gap.  */
5508       if (len_byte < len_byte_orig)
5509         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5510                     len_byte);
5511       coding->src_multibyte = 0;
5512     }
5513
5514   for (;;)
5515     {
5516       int result;
5517
5518       /* The buffer memory is now:
5519          +--------+converted-text+---------+-------original-text-------+---+
5520          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5521                   |<---------------------- GAP ----------------------->|  */
5522       src = GAP_END_ADDR - len_byte;
5523       dst = GPT_ADDR + inserted_byte;
5524
5525       if (encodep)
5526         result = encode_coding (coding, src, dst, len_byte, 0);
5527       else
5528         {
5529           if (coding->composing != COMPOSITION_DISABLED)
5530             coding->cmp_data->char_offset = from + inserted;
5531           result = decode_coding (coding, src, dst, len_byte, 0);
5532         }
5533
5534       /* The buffer memory is now:
5535          +--------+-------converted-text----+--+------original-text----+---+
5536          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5537                   |<---------------------- GAP ----------------------->|  */
5538
5539       inserted += coding->produced_char;
5540       inserted_byte += coding->produced;
5541       len_byte -= coding->consumed;
5542
           /* More composition data is needed: allocate it and resume.
              SRC and DST are recomputed at the top of the loop.  */
5543       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5544         {
5545           coding_allocate_composition_data (coding, from + inserted);
5546           continue;
5547         }
5548
5549       src += coding->consumed;
5550       dst += coding->produced;
5551
5552       if (result == CODING_FINISH_NORMAL)
5553         {
               /* Advance SRC to the end of the gap, so that SRC - DST
                  after the loop is the room left behind the converted
                  text.  */
5554           src += len_byte;
5555           break;
5556         }
5557       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5558         {
5559           unsigned char *pend = dst, *p = pend - inserted_byte;
5560           Lisp_Object eol_type;
5561
5562           /* Encode LFs back to the original eol format (CR or CRLF).  */
5563           if (coding->eol_type == CODING_EOL_CR)
5564             {
5565               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5566             }
5567           else
5568             {
5569               int count = 0;
5570
5571               while (p < pend) if (*p++ == '\n') count++;
5572               if (src - dst < count)
5573                 {
5574                   /* We don't have sufficient room for encoding LFs
5575                      back to CRLF.  We must record converted and
5576                      not-yet-converted text back to the buffer
5577                      content, enlarge the gap, then record them out of
5578                      the buffer contents again.  */
5579                   int add = len_byte + inserted_byte;
5580
5581                   GAP_SIZE -= add;
5582                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5583                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5584                   make_gap (count - GAP_SIZE);
5585                   GAP_SIZE += add;
5586                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5587                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5588                   /* Don't forget to update SRC, DST, and PEND.  */
5589                   src = GAP_END_ADDR - len_byte;
5590                   dst = GPT_ADDR + inserted_byte;
5591                   pend = dst;
5592                 }
5593               inserted += count;
5594               inserted_byte += count;
5595               coding->produced += count;
5596               p = dst = pend + count;
                   /* Copy backwards from the end, putting a CR in front
                      of each LF, until every LF has been handled.  */
5597               while (count)
5598                 {
5599                   *--p = *--pend;
5600                   if (*p == '\n') count--, *--p = '\r';
5601                 }
5602             }
5603
5604           /* Suppress eol-format conversion in the further conversion.  */
5605           coding->eol_type = CODING_EOL_LF;
5606
5607           /* Set the coding system symbol to that for Unix-like EOL.  */
5608           eol_type = Fget (saved_coding_symbol, Qeol_type);
5609           if (VECTORP (eol_type)
5610               && XVECTOR (eol_type)->size == 3
5611               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5612             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5613           else
5614             coding->symbol = saved_coding_symbol;
5615
5616           continue;
5617         }
           /* All the source bytes are consumed.  A CCL coding system gets
              one more pass with CODING_MODE_LAST_BLOCK set before we
              stop.  */
5618       if (len_byte <= 0)
5619         {
5620           if (coding->type != coding_type_ccl
5621               || coding->mode & CODING_MODE_LAST_BLOCK)
5622             break;
5623           coding->mode |= CODING_MODE_LAST_BLOCK;
5624           continue;
5625         }
5626       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5627         {
5628           /* The source text ends in invalid codes.  Let's just
5629              make them valid buffer contents, and finish conversion.  */
5630           if (multibyte_p)
5631             {
5632               unsigned char *start = dst;
5633
5634               inserted += len_byte;
5635               while (len_byte--)
5636                 {
5637                   int c = *src++;
5638                   dst += CHAR_STRING (c, dst);
5639                 }
5640
5641               inserted_byte += dst - start;
5642             }
5643           else
5644             {
5645               inserted += len_byte;
5646               inserted_byte += len_byte;
5647               while (len_byte--)
5648                 *dst++ = *src++;
5649             }
5650           break;
5651         }
5652       if (result == CODING_FINISH_INTERRUPT)
5653         {
5654           /* The conversion procedure was interrupted by a user.  */
5655           break;
5656         }
5657       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5658       if (coding->consumed < 1)
5659         {
5660           /* It's quite strange to require more memory without
5661              consuming any bytes.  Perhaps CCL program bug.  */
5662           break;
5663         }
5664       if (first)
5665         {
5666           /* We have just done the first batch of conversion which was
5667              stopped because of insufficient gap.  Let's reconsider the
5668              required gap size (i.e. SRC - DST) now.
5669
5670              We have converted ORIG bytes (== coding->consumed) into
5671              NEW bytes (coding->produced).  To convert the remaining
5672              LEN_BYTE bytes, we may need REQUIRE bytes of gap, where:
5673                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5674                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5675              Here, we are sure that NEW >= ORIG.  */
5676           float ratio = coding->produced - coding->consumed;
5677           ratio /= coding->consumed;
5678           require = len_byte * ratio;
5679           first = 0;
5680         }
5681       if ((src - dst) < (require + 2000))
5682         {
5683           /* See the comment above the previous call of make_gap.  */
5684           int add = len_byte + inserted_byte;
5685
5686           GAP_SIZE -= add;
5687           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5688           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5689           make_gap (require + 2000);
5690           GAP_SIZE += add;
5691           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5692           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5693         }
5694     }
5695   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5696
5697   if (encodep && coding->dst_multibyte)
5698     {
5699       /* The output is unibyte.  We must convert 8-bit characters to
5700          multibyte form.  */
5701       if (inserted_byte * 2 > GAP_SIZE)
5702         {
               /* Same dance as in the loop: count the converted text as
                  buffer contents while the gap is enlarged, then take it
                  out again.  */
5703           GAP_SIZE -= inserted_byte;
5704           ZV += inserted_byte; Z += inserted_byte;
5705           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5706           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5707           make_gap (inserted_byte - GAP_SIZE);
5708           GAP_SIZE += inserted_byte;
5709           ZV -= inserted_byte; Z -= inserted_byte;
5710           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5711           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5712         }
5713       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5714     }
5715
5716   /* If we shrank the conversion area, adjust it now.  */
5717   if (total_skip > 0)
5718     {
5719       if (tail_skip > 0)
5720         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5721       inserted += total_skip; inserted_byte += total_skip;
5722       GAP_SIZE += total_skip;
5723       GPT -= head_skip; GPT_BYTE -= head_skip;
5724       ZV -= total_skip; ZV_BYTE -= total_skip;
5725       Z -= total_skip; Z_BYTE -= total_skip;
5726       from -= head_skip; from_byte -= head_skip;
5727       to += tail_skip; to_byte += tail_skip;
5728     }
5729
       /* INSERTED is re-measured as the change of Z across the
          adjustment instead of reusing the count kept above.  */
5730   prev_Z = Z;
5731   if (! EQ (current_buffer->undo_list, Qt))
5732     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5733   else
5734     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5735                                  inserted, inserted_byte);
5736   inserted = Z - prev_Z;
5737
5738   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5739     coding_restore_composition (coding, Fcurrent_buffer ());
5740   coding_free_composition_data (coding);
5741
5742   if (! inhibit_pre_post_conversion
5743       && ! encodep && ! NILP (coding->post_read_conversion))
5744     {
5745       Lisp_Object val;
5746
5747       if (from != PT)
5748         TEMP_SET_PT_BOTH (from, from_byte);
5749       prev_Z = Z;
5750       record_unwind_protect (code_convert_region_unwind, Qnil);
5751       /* We should not call any more pre-write/post-read-conversion
5752          functions while this post-read-conversion is running.  */
5753       inhibit_pre_post_conversion = 1;
5754       val = call1 (coding->post_read_conversion, make_number (inserted));
5755       inhibit_pre_post_conversion = 0;
5756       /* Discard the unwind protect.  */
5757       specpdl_ptr--;
           /* VAL is only type-checked; the new length is taken from the
              change of Z instead.  */
5758       CHECK_NUMBER (val);
5759       inserted += Z - prev_Z;
5760     }
5761
       /* Restore point: shift it by the size change if it was at or
          after the end of the region, else leave it at FROM.  */
5762   if (orig_point >= from)
5763     {
5764       if (orig_point >= from + orig_len)
5765         orig_point += inserted - orig_len;
5766       else
5767         orig_point = from;
5768       TEMP_SET_PT (orig_point);
5769     }
5770
5771   if (replace)
5772     {
5773       signal_after_change (from, to - from, inserted);
5774       update_compositions (from, from + inserted, CHECK_BORDER);
5775     }
5776
       /* Report the totals for the whole region to the caller.  */
5777   {
5778     coding->consumed = to_byte - from_byte;
5779     coding->consumed_char = to - from;
5780     coding->produced = inserted_byte;
5781     coding->produced_char = inserted;
5782   }
5783
5784   return 0;
5785 }
5786
     /* Run the pre-write-conversion function (if ENCODEP is nonzero) or
        the post-read-conversion function (if ENCODEP is zero) of CODING
        on the text of STR, and return the resulting text as a string.

        The text of STR is inserted into the work buffer
        " *code-converting-work*", whose multibyteness is set to that of
        STR, and the function is called with that buffer current.  The
        result is made from the text between BEG and Z of the buffer that
        is current after the call.  The buffer that was current on entry
        is restored through the unwind-protect on Fset_buffer.

        This function does not check that the conversion function is
        defined; the caller must do that.  */
5787 Lisp_Object
5788 run_pre_post_conversion_on_str (str, coding, encodep)
5789      Lisp_Object str;
5790      struct coding_system *coding;
5791      int encodep;
5792 {
       /* Remember the specpdl depth so that unbind_to at the end runs
          both unwind-protects recorded below.  */
5793   int count = specpdl_ptr - specpdl;
5794   struct gcpro gcpro1;
5795   int multibyte = STRING_MULTIBYTE (str);
5796
5797   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
       /* NOTE(review): code_convert_region_unwind is defined elsewhere;
          presumably it resets inhibit_pre_post_conversion if the
          conversion function exits non-locally -- confirm.  */
5798   record_unwind_protect (code_convert_region_unwind, Qnil);
       /* Keep STR protected until its text has been inserted.  */
5799   GCPRO1 (str);
5800   temp_output_buffer_setup (" *code-converting-work*");
5801   set_buffer_internal (XBUFFER (Vstandard_output));
5802   /* We must insert the contents of STR as is without
5803      unibyte<->multibyte conversion.  For that, we adjust the
5804      multibyteness of the working buffer to that of STR.  */
5805   Ferase_buffer ();
5806   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5807   insert_from_string (str, 0, 0,
5808                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5809   UNGCPRO;
       /* No other pre-write/post-read-conversion function may run while
          this one is running.  */
5810   inhibit_pre_post_conversion = 1;
5811   if (encodep)
5812     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5813   else
5814     {
5815       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5816       call1 (coding->post_read_conversion, make_number (Z - BEG));
5817     }
5818   inhibit_pre_post_conversion = 0;
5819   str = make_buffer_string (BEG, Z, 1);
5820   return unbind_to (count, str);
5821 }
5822
     /* Decode the string STR by the coding system CODING and return the
        decoded string.

        If it turns out that no decoding is necessary, the text of STR is
        returned unchanged: as STR itself when NOCOPY is nonzero or when
        STR already had to be converted to another string, otherwise as
        a copy made by Fcopy_sequence.  In that case the
        post-read-conversion function is not run.

        Otherwise a new string is built from the skipped head of STR, the
        decoded text and the skipped tail of STR, saved compositions are
        restored on it, and it is passed through CODING's
        post-read-conversion function when that is a symbol with a
        function definition.

        The consumed, consumed_char, produced and produced_char members
        of CODING are set before returning; on the decoding path they do
        not include the skipped head and tail.  */
5823 Lisp_Object
5824 decode_coding_string (str, coding, nocopy)
5825      Lisp_Object str;
5826      struct coding_system *coding;
5827      int nocopy;
5828 {
5829   int len;
5830   struct conversion_buffer buf;
       /* FROM and TO_BYTE are byte offsets into the data of STR that
          delimit the part still to be decoded.  */
5831   int from, to_byte;
5832   Lisp_Object saved_coding_symbol;
5833   int result;
5834   int require_decoding;
5835   int shrinked_bytes = 0;
5836   Lisp_Object newstr;
5837   int consumed, consumed_char, produced, produced_char;
5838
5839   from = 0;
5840   to_byte = STRING_BYTES (XSTRING (str));
5841
5842   saved_coding_symbol = coding->symbol;
5843   coding->src_multibyte = STRING_MULTIBYTE (str);
5844   coding->dst_multibyte = 1;
5845   if (CODING_REQUIRE_DETECTION (coding))
5846     {
5847       /* See the comments in code_convert_region.  */
5848       if (coding->type == coding_type_undecided)
5849         {
5850           detect_coding (coding, XSTRING (str)->data, to_byte);
5851           if (coding->type == coding_type_undecided)
5852             {
5853               coding->type = coding_type_emacs_mule;
5854               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5855               /* As emacs-mule decoder will handle composition, we
5856                  need this setting to allocate coding->cmp_data
5857                  later.  */
5858               coding->composing = COMPOSITION_NO;
5859             }
5860         }
5861       if (coding->eol_type == CODING_EOL_UNDECIDED
5862           && coding->type != coding_type_ccl)
5863         {
5864           saved_coding_symbol = coding->symbol;
5865           detect_eol (coding, XSTRING (str)->data, to_byte);
5866           if (coding->eol_type == CODING_EOL_UNDECIDED)
5867             coding->eol_type = CODING_EOL_LF;
5868           /* We had better recover the original eol format if we
5869              encounter an inconsistent eol format while decoding.  */
5870           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5871         }
5872     }
5873
5874   if (coding->type == coding_type_no_conversion
5875       || coding->type == coding_type_raw_text)
5876     coding->dst_multibyte = 0;
5877
5878   require_decoding = CODING_REQUIRE_DECODING (coding);
5879
5880   if (STRING_MULTIBYTE (str))
5881     {
5882       /* Decoding routines expect the source text to be unibyte.  */
5883       str = Fstring_as_unibyte (str);
5884       to_byte = STRING_BYTES (XSTRING (str));
           /* NOTE(review): NOCOPY is forced on here, presumably because
              Fstring_as_unibyte already returned a separate string --
              confirm.  */
5885       nocopy = 1;
5886       coding->src_multibyte = 0;
5887     }
5888
5889   /* Try to skip the leading and trailing ASCII characters.  */
5890   if (require_decoding && coding->type != coding_type_ccl)
5891     {
5892       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5893                                 0);
5894       if (from == to_byte)
5895         require_decoding = 0;
           /* Total number of head and tail bytes excluded from decoding;
              they are copied verbatim into the result below.  */
5896       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5897     }
5898
5899   if (!require_decoding)
5900     {
5901       coding->consumed = STRING_BYTES (XSTRING (str));
5902       coding->consumed_char = XSTRING (str)->size;
5903       if (coding->dst_multibyte)
5904         {
5905           str = Fstring_as_multibyte (str);
5906           nocopy = 1;
5907         }
5908       coding->produced = STRING_BYTES (XSTRING (str));
5909       coding->produced_char = XSTRING (str)->size;
5910       return (nocopy ? str : Fcopy_sequence (str));
5911     }
5912
5913   if (coding->composing != COMPOSITION_DISABLED)
5914     coding_allocate_composition_data (coding, from);
5915   len = decoding_buffer_size (coding, to_byte - from);
5916   allocate_conversion_buffer (buf, len);
5917
       /* Each call of decode_coding resumes where the previous one
          stopped; the totals are accumulated in CONSUMED, CONSUMED_CHAR,
          PRODUCED and PRODUCED_CHAR.  */
5918   consumed = consumed_char = produced = produced_char = 0;
5919   while (1)
5920     {
5921       result = decode_coding (coding, XSTRING (str)->data + from + consumed,
5922                               buf.data + produced, to_byte - from - consumed,
5923                               buf.size - produced);
5924       consumed += coding->consumed;
5925       consumed_char += coding->consumed_char;
5926       produced += coding->produced;
5927       produced_char += coding->produced_char;
           /* Stop when the decoder is done, or when it asks for more
              source without having consumed anything in this pass.  */
5928       if (result == CODING_FINISH_NORMAL
5929           || (result == CODING_FINISH_INSUFFICIENT_SRC
5930               && coding->consumed == 0))
5931         break;
5932       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5933         coding_allocate_composition_data (coding, from + produced_char);
5934       else if (result == CODING_FINISH_INSUFFICIENT_DST)
5935         extend_conversion_buffer (&buf);
5936       else if (result == CODING_FINISH_INCONSISTENT_EOL)
5937         {
5938           Lisp_Object eol_type;
5939
5940           /* Recover the original EOL format.  */
5941           if (coding->eol_type == CODING_EOL_CR)
5942             {
5943               unsigned char *p;
5944               for (p = buf.data; p < buf.data + produced; p++)
5945                 if (*p == '\n') *p = '\r';
5946             }
5947           else if (coding->eol_type == CODING_EOL_CRLF)
5948             {
5949               int num_eol = 0;
5950               unsigned char *p0, *p1;
5951               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5952                 if (*p0 == '\n') num_eol++;
                   /* NOTE(review): the buffer is extended at most once
                      here; confirm that one extension always makes room
                      for NUM_EOL extra bytes.  */
5953               if (produced + num_eol >= buf.size)
5954                 extend_conversion_buffer (&buf);
                   /* Expand in place, working backwards from the end and
                      putting a CR in front of each LF.  */
5955               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5956                 {
5957                   *--p1 = *--p0;
5958                   if (*p0 == '\n') *--p1 = '\r';
5959                 }
5960               produced += num_eol;
5961               produced_char += num_eol;
5962             }
5963           /* Suppress eol-format conversion in the further conversion.  */
5964           coding->eol_type = CODING_EOL_LF;
5965
5966           /* Set the coding system symbol to that for Unix-like EOL.  */
5967           eol_type = Fget (saved_coding_symbol, Qeol_type);
5968           if (VECTORP (eol_type)
5969               && XVECTOR (eol_type)->size == 3
5970               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5971             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5972           else
5973             coding->symbol = saved_coding_symbol;
5974
5975
5976         }
5977     }
5978
5979   coding->consumed = consumed;
5980   coding->consumed_char = consumed_char;
5981   coding->produced = produced;
5982   coding->produced_char = produced_char;
5983
       /* Assemble the result: the skipped head of STR, the decoded text,
          then the skipped tail of STR.  The skipped bytes count as one
          character each.  */
5984   if (coding->dst_multibyte)
5985     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
5986                                            produced + shrinked_bytes);
5987   else
5988     newstr = make_uninit_string (produced + shrinked_bytes);
5989   if (from > 0)
5990     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5991   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5992   if (shrinked_bytes > from)
5993     bcopy (XSTRING (str)->data + to_byte,
5994            XSTRING (newstr)->data + from + produced,
5995            shrinked_bytes - from);
5996   free_conversion_buffer (&buf);
5997
5998   if (coding->cmp_data && coding->cmp_data->used)
5999     coding_restore_composition (coding, newstr);
6000   coding_free_composition_data (coding);
6001
6002   if (SYMBOLP (coding->post_read_conversion)
6003       && !NILP (Ffboundp (coding->post_read_conversion)))
6004     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6005
6006   return newstr;
6007 }
6008
6009 Lisp_Object
6010 encode_coding_string (str, coding, nocopy)
6011      Lisp_Object str;
6012      struct coding_system *coding;
6013      int nocopy;
6014 {
6015   int len;
6016   struct conversion_buffer buf;
6017   int from, to, to_byte;
6018   int result;
6019   int shrinked_bytes = 0;
6020   Lisp_Object newstr;
6021   int consumed, consumed_char, produced, produced_char;
6022
6023   if (SYMBOLP (coding->pre_write_conversion)
6024       && !NILP (Ffboundp (coding->pre_write_conversion)))
6025     str = run_pre_post_conversion_on_str (str, coding, 1);
6026
6027   from = 0;
6028   to = XSTRING (str)->size;
6029   to_byte = STRING_BYTES (XSTRING (str));
6030
6031   /* Encoding routines determine the multibyteness of the source text
6032      by coding->src_multibyte.  */
6033   coding->src_multibyte = STRING_MULTIBYTE (str);
6034   coding->dst_multibyte = 0;
6035   if (! CODING_REQUIRE_ENCODING (coding))
6036     {
6037       coding->consumed = STRING_BYTES (XSTRING (str));
6038       coding->consumed_char = XSTRING (str)->size;
6039       if (STRING_MULTIBYTE (str))
6040         {
6041           str = Fstring_as_unibyte (str);
6042           nocopy = 1;
6043         }
6044       coding->produced = STRING_BYTES (XSTRING (str));
6045       coding->produced_char = XSTRING (str)->size;
6046       return (nocopy ? str : Fcopy_sequence (str));
6047     }
6048
6049   if (coding->composing != COMPOSITION_DISABLED)
6050     coding_save_composition (coding, from, to, str);
6051
6052   /* Try to skip the heading and tailing ASCIIs.  */
6053   if (coding->type != coding_type_ccl)
6054     {
6055       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
6056                                 1);
6057       if (from == to_byte)
6058         return (nocopy ? str : Fcopy_sequence (str));
6059       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
6060     }
6061
6062   len = encoding_buffer_size (coding, to_byte - from);
6063   allocate_conversion_buffer (buf, len);
6064
6065   consumed = consumed_char = produced = produced_char = 0;
6066   while (1)
6067     {
6068       result = encode_coding (coding, XSTRING (str)->data + from + consumed,
6069                               buf.data + produced, to_byte - from - consumed,
6070                               buf.size - produced);
6071       consumed += coding->consumed;
6072       consumed_char += coding->consumed_char;
6073       produced += coding->produced;
6074       produced_char += coding->produced_char;
6075       if (result == CODING_FINISH_NORMAL
6076           || (result == CODING_FINISH_INSUFFICIENT_SRC
6077               && coding->consumed == 0))
6078         break;
6079       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6080       extend_conversion_buffer (&buf);
6081     }
6082
6083   coding->consumed = consumed;
6084   coding->consumed_char = consumed_char;
6085   coding->produced = produced;
6086   coding->produced_char = produced_char;
6087
6088   newstr = make_uninit_string (produced + shrinked_bytes);
6089   if (from > 0)
6090     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
6091   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
6092   if (shrinked_bytes > from)
6093     bcopy (XSTRING (str)->data + to_byte,
6094            XSTRING (newstr)->data + from + produced,
6095            shrinked_bytes - from);
6096
6097   free_conversion_buffer (&buf);
6098   coding_free_composition_data (coding);
6099
6100   return newstr;
6101 }
6102
6103 \f
6104 #ifdef emacs
6105 /*** 8. Emacs Lisp library functions ***/
6106
6107 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6108        doc: /* Return t if OBJECT is nil or a coding-system.
6109 See the documentation of `make-coding-system' for information
6110 about coding-system objects.  */)
6111      (obj)
6112      Lisp_Object obj;
6113 {
6114   if (NILP (obj))
6115     return Qt;
6116   if (!SYMBOLP (obj))
6117     return Qnil;
6118   /* Get coding-spec vector for OBJ.  */
6119   obj = Fget (obj, Qcoding_system);
6120   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6121           ? Qt : Qnil);
6122 }
6123
6124 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6125        Sread_non_nil_coding_system, 1, 1, 0,
6126        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6127      (prompt)
6128      Lisp_Object prompt;
6129 {
6130   Lisp_Object val;
6131   do
6132     {
6133       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6134                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6135     }
6136   while (XSTRING (val)->size == 0);
6137   return (Fintern (val, Qnil));
6138 }
6139
6140 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6141        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6142 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6143      (prompt, default_coding_system)
6144      Lisp_Object prompt, default_coding_system;
6145 {
6146   Lisp_Object val;
6147   if (SYMBOLP (default_coding_system))
6148     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6149   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6150                           Qt, Qnil, Qcoding_system_history,
6151                           default_coding_system, Qnil);
6152   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6153 }
6154
6155 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6156        1, 1, 0,
6157        doc: /* Check validity of CODING-SYSTEM.
6158 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6159 It is valid if it is a symbol with a non-nil `coding-system' property.
6160 The value of property should be a vector of length 5.  */)
6161      (coding_system)
6162      Lisp_Object coding_system;
6163 {
6164   CHECK_SYMBOL (coding_system);
6165   if (!NILP (Fcoding_system_p (coding_system)))
6166     return coding_system;
6167   while (1)
6168     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6169 }
6170 \f
6171 Lisp_Object
6172 detect_coding_system (src, src_bytes, highest, multibytep)
6173      unsigned char *src;
6174      int src_bytes, highest;
6175      int multibytep;
6176 {
6177   int coding_mask, eol_type;
6178   Lisp_Object val, tmp;
6179   int dummy;
6180
6181   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6182   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6183   if (eol_type == CODING_EOL_INCONSISTENT)
6184     eol_type = CODING_EOL_UNDECIDED;
6185
6186   if (!coding_mask)
6187     {
6188       val = Qundecided;
6189       if (eol_type != CODING_EOL_UNDECIDED)
6190         {
6191           Lisp_Object val2;
6192           val2 = Fget (Qundecided, Qeol_type);
6193           if (VECTORP (val2))
6194             val = XVECTOR (val2)->contents[eol_type];
6195         }
6196       return (highest ? val : Fcons (val, Qnil));
6197     }
6198
6199   /* At first, gather possible coding systems in VAL.  */
6200   val = Qnil;
6201   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6202     {
6203       Lisp_Object category_val, category_index;
6204
6205       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6206       category_val = Fsymbol_value (XCAR (tmp));
6207       if (!NILP (category_val)
6208           && NATNUMP (category_index)
6209           && (coding_mask & (1 << XFASTINT (category_index))))
6210         {
6211           val = Fcons (category_val, val);
6212           if (highest)
6213             break;
6214         }
6215     }
6216   if (!highest)
6217     val = Fnreverse (val);
6218
6219   /* Then, replace the elements with subsidiary coding systems.  */
6220   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6221     {
6222       if (eol_type != CODING_EOL_UNDECIDED
6223           && eol_type != CODING_EOL_INCONSISTENT)
6224         {
6225           Lisp_Object eol;
6226           eol = Fget (XCAR (tmp), Qeol_type);
6227           if (VECTORP (eol))
6228             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6229         }
6230     }
6231   return (highest ? XCAR (val) : val);
6232 }
6233
6234 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6235        2, 3, 0,
6236        doc: /* Detect coding system of the text in the region between START and END.
6237 Return a list of possible coding systems ordered by priority.
6238
6239 If only ASCII characters are found, it returns a list of single element
6240 `undecided' or its subsidiary coding system according to a detected
6241 end-of-line format.
6242
6243 If optional argument HIGHEST is non-nil, return the coding system of
6244 highest priority.  */)
6245      (start, end, highest)
6246      Lisp_Object start, end, highest;
6247 {
6248   int from, to;
6249   int from_byte, to_byte;
6250   int include_anchor_byte = 0;
6251
6252   CHECK_NUMBER_COERCE_MARKER (start);
6253   CHECK_NUMBER_COERCE_MARKER (end);
6254
6255   validate_region (&start, &end);
6256   from = XINT (start), to = XINT (end);
6257   from_byte = CHAR_TO_BYTE (from);
6258   to_byte = CHAR_TO_BYTE (to);
6259
6260   if (from < GPT && to >= GPT)
6261     move_gap_both (to, to_byte);
6262   /* If we an anchor byte `\0' follows the region, we include it in
6263      the detecting source.  Then code detectors can handle the tailing
6264      byte sequence more accurately.
6265
6266      Fix me: This is not an perfect solution.  It is better that we
6267      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6268   */
6269   if (to == Z || (to == GPT && GAP_SIZE > 0))
6270     include_anchor_byte = 1;
6271   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6272                                to_byte - from_byte + include_anchor_byte,
6273                                !NILP (highest),
6274                                !NILP (current_buffer
6275                                       ->enable_multibyte_characters));
6276 }
6277
6278 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6279        1, 2, 0,
6280        doc: /* Detect coding system of the text in STRING.
6281 Return a list of possible coding systems ordered by priority.
6282
6283 If only ASCII characters are found, it returns a list of single element
6284 `undecided' or its subsidiary coding system according to a detected
6285 end-of-line format.
6286
6287 If optional argument HIGHEST is non-nil, return the coding system of
6288 highest priority.  */)
6289      (string, highest)
6290      Lisp_Object string, highest;
6291 {
6292   CHECK_STRING (string);
6293
6294   return detect_coding_system (XSTRING (string)->data,
6295                                /* "+ 1" is to include the anchor byte
6296                                   `\0'.  With this, code detectors can
6297                                   handle the tailing bytes more
6298                                   accurately.  */
6299                                STRING_BYTES (XSTRING (string)) + 1,
6300                                !NILP (highest),
6301                                STRING_MULTIBYTE (string));
6302 }
6303
6304 /* Return an intersection of lists L1 and L2.  */
6305
6306 static Lisp_Object
6307 intersection (l1, l2)
6308      Lisp_Object l1, l2;
6309 {
6310   Lisp_Object val;
6311
6312   for (val = Qnil; CONSP (l1); l1 = XCDR (l1))
6313     {
6314       if (!NILP (Fmemq (XCAR (l1), l2)))
6315         val = Fcons (XCAR (l1), val);
6316     }
6317   return val;
6318 }
6319
6320
6321 /*  Subroutine for Fsafe_coding_systems_region_internal.
6322
6323     Return a list of coding systems that safely encode the multibyte
6324     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
6325     possible coding systems.  If it is nil, it means that we have not
6326     yet found any coding systems.
6327
6328     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
6329     element of WORK_TABLE is set to t once the element is looked up.
6330
6331     If a non-ASCII single byte char is found, set
6332     *single_byte_char_found to 1.  */
6333
6334 static Lisp_Object
6335 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6336      unsigned char *p, *pend;
6337      Lisp_Object safe_codings, work_table;
6338      int *single_byte_char_found;
6339 {
6340   int c, len, idx;
6341   Lisp_Object val;
6342
6343   while (p < pend)
6344     {
6345       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6346       p += len;
6347       if (ASCII_BYTE_P (c))
6348         /* We can ignore ASCII characters here.  */
6349         continue;
6350       if (SINGLE_BYTE_CHAR_P (c))
6351         *single_byte_char_found = 1;
6352       if (NILP (safe_codings))
6353         continue;
6354       /* Check the safe coding systems for C.  */
6355       val = char_table_ref_and_index (work_table, c, &idx);
6356       if (EQ (val, Qt))
6357         /* This element was already checked.  Ignore it.  */
6358         continue;
6359       /* Remember that we checked this element.  */
6360       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
6361
6362       /* If there are some safe coding systems for C and we have
6363          already found the other set of coding systems for the
6364          different characters, get the intersection of them.  */
6365       if (!EQ (safe_codings, Qt) && !NILP (val))
6366         val = intersection (safe_codings, val);
6367       safe_codings = val;
6368     }
6369   return safe_codings;
6370 }
6371
6372
6373 /* Return a list of coding systems that safely encode the text between
6374    START and END.  If the text contains only ASCII or is unibyte,
6375    return t.  */
6376
6377 DEFUN ("find-coding-systems-region-internal",
6378        Ffind_coding_systems_region_internal,
6379        Sfind_coding_systems_region_internal, 2, 2, 0,
6380        doc: /* Internal use only.  */)
6381      (start, end)
6382      Lisp_Object start, end;
6383 {
6384   Lisp_Object work_table, safe_codings;
6385   int non_ascii_p = 0;
6386   int single_byte_char_found = 0;
6387   unsigned char *p1, *p1end, *p2, *p2end, *p;
6388
6389   if (STRINGP (start))
6390     {
6391       if (!STRING_MULTIBYTE (start))
6392         return Qt;
6393       p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start));
6394       p2 = p2end = p1end;
6395       if (XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6396         non_ascii_p = 1;
6397     }
6398   else
6399     {
6400       int from, to, stop;
6401
6402       CHECK_NUMBER_COERCE_MARKER (start);
6403       CHECK_NUMBER_COERCE_MARKER (end);
6404       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6405         args_out_of_range (start, end);
6406       if (NILP (current_buffer->enable_multibyte_characters))
6407         return Qt;
6408       from = CHAR_TO_BYTE (XINT (start));
6409       to = CHAR_TO_BYTE (XINT (end));
6410       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6411       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6412       if (stop == to)
6413         p2 = p2end = p1end;
6414       else
6415         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6416       if (XINT (end) - XINT (start) != to - from)
6417         non_ascii_p = 1;
6418     }
6419
6420   if (!non_ascii_p)
6421     {
6422       /* We are sure that the text contains no multibyte character.
6423          Check if it contains eight-bit-graphic.  */
6424       p = p1;
6425       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6426       if (p == p1end)
6427         {
6428           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6429           if (p == p2end)
6430             return Qt;
6431         }
6432     }
6433
6434   /* The text contains non-ASCII characters.  */
6435   work_table = Fcopy_sequence (Vchar_coding_system_table);
6436   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
6437                                     &single_byte_char_found);
6438   if (p2 < p2end)
6439     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6440                                       &single_byte_char_found);
6441
6442   if (EQ (safe_codings, Qt))
6443     ; /* Nothing to be done.  */
6444   else if (!single_byte_char_found)
6445     {
6446       /* Append generic coding systems.  */
6447       Lisp_Object args[2];
6448       args[0] = safe_codings;
6449       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
6450                                         make_number (0));
6451       safe_codings = Fappend (2, args);
6452     }
6453   else
6454     safe_codings = Fcons (Qraw_text,
6455                           Fcons (Qemacs_mule,
6456                                  Fcons (Qno_conversion, safe_codings)));
6457   return safe_codings;
6458 }
6459
6460
6461 Lisp_Object
6462 code_convert_region1 (start, end, coding_system, encodep)
6463      Lisp_Object start, end, coding_system;
6464      int encodep;
6465 {
6466   struct coding_system coding;
6467   int from, to;
6468
6469   CHECK_NUMBER_COERCE_MARKER (start);
6470   CHECK_NUMBER_COERCE_MARKER (end);
6471   CHECK_SYMBOL (coding_system);
6472
6473   validate_region (&start, &end);
6474   from = XFASTINT (start);
6475   to = XFASTINT (end);
6476
6477   if (NILP (coding_system))
6478     return make_number (to - from);
6479
6480   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6481     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
6482
6483   coding.mode |= CODING_MODE_LAST_BLOCK;
6484   coding.src_multibyte = coding.dst_multibyte
6485     = !NILP (current_buffer->enable_multibyte_characters);
6486   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6487                        &coding, encodep, 1);
6488   Vlast_coding_system_used = coding.symbol;
6489   return make_number (coding.produced_char);
6490 }
6491
6492 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6493        3, 3, "r\nzCoding system: ",
6494        doc: /* Decode the current region from the specified coding system.
6495 When called from a program, takes three arguments:
6496 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6497 This function sets `last-coding-system-used' to the precise coding system
6498 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6499 not fully specified.)
6500 It returns the length of the decoded text.  */)
6501      (start, end, coding_system)
6502      Lisp_Object start, end, coding_system;
6503 {
6504   return code_convert_region1 (start, end, coding_system, 0);
6505 }
6506
6507 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6508        3, 3, "r\nzCoding system: ",
6509        doc: /* Encode the current region into the specified coding system.
6510 When called from a program, takes three arguments:
6511 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6512 This function sets `last-coding-system-used' to the precise coding system
6513 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6514 not fully specified.)
6515 It returns the length of the encoded text.  */)
6516      (start, end, coding_system)
6517      Lisp_Object start, end, coding_system;
6518 {
6519   return code_convert_region1 (start, end, coding_system, 1);
6520 }
6521
6522 Lisp_Object
6523 code_convert_string1 (string, coding_system, nocopy, encodep)
6524      Lisp_Object string, coding_system, nocopy;
6525      int encodep;
6526 {
6527   struct coding_system coding;
6528
6529   CHECK_STRING (string);
6530   CHECK_SYMBOL (coding_system);
6531
6532   if (NILP (coding_system))
6533     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6534
6535   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6536     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
6537
6538   coding.mode |= CODING_MODE_LAST_BLOCK;
6539   string = (encodep
6540             ? encode_coding_string (string, &coding, !NILP (nocopy))
6541             : decode_coding_string (string, &coding, !NILP (nocopy)));
6542   Vlast_coding_system_used = coding.symbol;
6543
6544   return string;
6545 }
6546
6547 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6548        2, 3, 0,
6549        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6550 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6551 if the decoding operation is trivial.
6552 This function sets `last-coding-system-used' to the precise coding system
6553 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6554 not fully specified.)  */)
6555      (string, coding_system, nocopy)
6556      Lisp_Object string, coding_system, nocopy;
6557 {
6558   return code_convert_string1 (string, coding_system, nocopy, 0);
6559 }
6560
6561 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6562        2, 3, 0,
6563        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6564 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6565 if the encoding operation is trivial.
6566 This function sets `last-coding-system-used' to the precise coding system
6567 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6568 not fully specified.)  */)
6569      (string, coding_system, nocopy)
6570      Lisp_Object string, coding_system, nocopy;
6571 {
6572   return code_convert_string1 (string, coding_system, nocopy, 1);
6573 }
6574
6575 /* Encode or decode STRING according to CODING_SYSTEM.
6576    Do not set Vlast_coding_system_used.
6577
6578    This function is called only from macros DECODE_FILE and
6579    ENCODE_FILE, thus we ignore character composition.  */
6580
6581 Lisp_Object
6582 code_convert_string_norecord (string, coding_system, encodep)
6583      Lisp_Object string, coding_system;
6584      int encodep;
6585 {
6586   struct coding_system coding;
6587
6588   CHECK_STRING (string);
6589   CHECK_SYMBOL (coding_system);
6590
6591   if (NILP (coding_system))
6592     return string;
6593
6594   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6595     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
6596
6597   coding.composing = COMPOSITION_DISABLED;
6598   coding.mode |= CODING_MODE_LAST_BLOCK;
6599   return (encodep
6600           ? encode_coding_string (string, &coding, 1)
6601           : decode_coding_string (string, &coding, 1));
6602 }
6603 \f
6604 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6605        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6606 Return the corresponding character.  */)
6607      (code)
6608      Lisp_Object code;
6609 {
6610   unsigned char c1, c2, s1, s2;
6611   Lisp_Object val;
6612
6613   CHECK_NUMBER (code);
6614   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6615   if (s1 == 0)
6616     {
6617       if (s2 < 0x80)
6618         XSETFASTINT (val, s2);
6619       else if (s2 >= 0xA0 || s2 <= 0xDF)
6620         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6621       else
6622         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6623     }
6624   else
6625     {
6626       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
6627           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6628         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6629       DECODE_SJIS (s1, s2, c1, c2);
6630       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6631     }
6632   return val;
6633 }
6634
6635 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6636        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6637 Return the corresponding code in SJIS.  */)
6638      (ch)
6639      Lisp_Object ch;
6640 {
6641   int charset, c1, c2, s1, s2;
6642   Lisp_Object val;
6643
6644   CHECK_NUMBER (ch);
6645   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6646   if (charset == CHARSET_ASCII)
6647     {
6648       val = ch;
6649     }
6650   else if (charset == charset_jisx0208
6651            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6652     {
6653       ENCODE_SJIS (c1, c2, s1, s2);
6654       XSETFASTINT (val, (s1 << 8) | s2);
6655     }
6656   else if (charset == charset_katakana_jisx0201
6657            && c1 > 0x20 && c2 < 0xE0)
6658     {
6659       XSETFASTINT (val, c1 | 0x80);
6660     }
6661   else
6662     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6663   return val;
6664 }
6665
6666 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6667        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6668 Return the corresponding character.  */)
6669      (code)
6670      Lisp_Object code;
6671 {
6672   int charset;
6673   unsigned char b1, b2, c1, c2;
6674   Lisp_Object val;
6675
6676   CHECK_NUMBER (code);
6677   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6678   if (b1 == 0)
6679     {
6680       if (b2 >= 0x80)
6681         error ("Invalid BIG5 code: %x", XFASTINT (code));
6682       val = code;
6683     }
6684   else
6685     {
6686       if ((b1 < 0xA1 || b1 > 0xFE)
6687           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6688         error ("Invalid BIG5 code: %x", XFASTINT (code));
6689       DECODE_BIG5 (b1, b2, charset, c1, c2);
6690       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6691     }
6692   return val;
6693 }
6694
6695 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6696        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6697 Return the corresponding character code in Big5.  */)
6698      (ch)
6699      Lisp_Object ch;
6700 {
6701   int charset, c1, c2, b1, b2;
6702   Lisp_Object val;
6703
6704   CHECK_NUMBER (ch);
6705   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6706   if (charset == CHARSET_ASCII)
6707     {
6708       val = ch;
6709     }
6710   else if ((charset == charset_big5_1
6711             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6712            || (charset == charset_big5_2
6713                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6714     {
6715       ENCODE_BIG5 (charset, c1, c2, b1, b2);
6716       XSETFASTINT (val, (b1 << 8) | b2);
6717     }
6718   else
6719     error ("Can't encode to Big5: %d", XFASTINT (ch));
6720   return val;
6721 }
6722 \f
6723 DEFUN ("set-terminal-coding-system-internal",
6724        Fset_terminal_coding_system_internal,
6725        Sset_terminal_coding_system_internal, 1, 1, 0,
6726        doc: /* Internal use only.  */)
6727      (coding_system)
6728      Lisp_Object coding_system;
6729 {
6730   CHECK_SYMBOL (coding_system);
6731   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6732   /* We had better not send unsafe characters to terminal.  */
6733   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6734   /* Character composition should be disabled.  */
6735   terminal_coding.composing = COMPOSITION_DISABLED;
6736   /* Error notification should be suppressed.  */
6737   terminal_coding.suppress_error = 1;
6738   terminal_coding.src_multibyte = 1;
6739   terminal_coding.dst_multibyte = 0;
6740   return Qnil;
6741 }
6742
6743 DEFUN ("set-safe-terminal-coding-system-internal",
6744        Fset_safe_terminal_coding_system_internal,
6745        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6746        doc: /* Internal use only.  */)
6747      (coding_system)
6748      Lisp_Object coding_system;
6749 {
6750   CHECK_SYMBOL (coding_system);
6751   setup_coding_system (Fcheck_coding_system (coding_system),
6752                        &safe_terminal_coding);
6753   /* Character composition should be disabled.  */
6754   safe_terminal_coding.composing = COMPOSITION_DISABLED;
6755   /* Error notification should be suppressed.  */
6756   terminal_coding.suppress_error = 1;
6757   safe_terminal_coding.src_multibyte = 1;
6758   safe_terminal_coding.dst_multibyte = 0;
6759   return Qnil;
6760 }
6761
6762 DEFUN ("terminal-coding-system",
6763        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6764        doc: /* Return coding system specified for terminal output.  */)
6765      ()
6766 {
6767   return terminal_coding.symbol;
6768 }
6769
6770 DEFUN ("set-keyboard-coding-system-internal",
6771        Fset_keyboard_coding_system_internal,
6772        Sset_keyboard_coding_system_internal, 1, 1, 0,
6773        doc: /* Internal use only.  */)
6774      (coding_system)
6775      Lisp_Object coding_system;
6776 {
6777   CHECK_SYMBOL (coding_system);
6778   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6779   /* Character composition should be disabled.  */
6780   keyboard_coding.composing = COMPOSITION_DISABLED;
6781   return Qnil;
6782 }
6783
6784 DEFUN ("keyboard-coding-system",
6785        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6786        doc: /* Return coding system specified for decoding keyboard input.  */)
6787      ()
6788 {
6789   return keyboard_coding.symbol;
6790 }
6791
6792 \f
6793 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6794        Sfind_operation_coding_system,  1, MANY, 0,
6795        doc: /* Choose a coding system for an operation based on the target name.
6796 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
6797 DECODING-SYSTEM is the coding system to use for decoding
6798 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
6799 for encoding (in case OPERATION does encoding).
6800
6801 The first argument OPERATION specifies an I/O primitive:
6802   For file I/O, `insert-file-contents' or `write-region'.
6803   For process I/O, `call-process', `call-process-region', or `start-process'.
6804   For network I/O, `open-network-stream'.
6805
6806 The remaining arguments should be the same arguments that were passed
6807 to the primitive.  Depending on which primitive, one of those arguments
6808 is selected as the TARGET.  For example, if OPERATION does file I/O,
6809 whichever argument specifies the file name is TARGET.
6810
6811 TARGET has a meaning which depends on OPERATION:
6812   For file I/O, TARGET is a file name.
6813   For process I/O, TARGET is a process name.
6814   For network I/O, TARGET is a service name or a port number
6815
6816 This function looks up what specified for TARGET in,
6817 `file-coding-system-alist', `process-coding-system-alist',
6818 or `network-coding-system-alist' depending on OPERATION.
6819 They may specify a coding system, a cons of coding systems,
6820 or a function symbol to call.
6821 In the last case, we call the function with one argument,
6822 which is a list of all the arguments given to this function.
6823
6824 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
6825      (nargs, args)
6826      int nargs;
6827      Lisp_Object *args;
6828 {
6829   Lisp_Object operation, target_idx, target, val;
6830   register Lisp_Object chain;
6831
6832   if (nargs < 2)
6833     error ("Too few arguments");
6834   operation = args[0];
6835   if (!SYMBOLP (operation)
6836       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
6837     error ("Invalid first argument");
6838   if (nargs < 1 + XINT (target_idx))
6839     error ("Too few arguments for operation: %s",
6840            XSYMBOL (operation)->name->data);
6841   target = args[XINT (target_idx) + 1];
6842   if (!(STRINGP (target)
6843         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
6844     error ("Invalid argument %d", XINT (target_idx) + 1);
6845
6846   chain = ((EQ (operation, Qinsert_file_contents)
6847             || EQ (operation, Qwrite_region))
6848            ? Vfile_coding_system_alist
6849            : (EQ (operation, Qopen_network_stream)
6850               ? Vnetwork_coding_system_alist
6851               : Vprocess_coding_system_alist));
6852   if (NILP (chain))
6853     return Qnil;
6854
6855   for (; CONSP (chain); chain = XCDR (chain))
6856     {
6857       Lisp_Object elt;
6858       elt = XCAR (chain);
6859
6860       if (CONSP (elt)
6861           && ((STRINGP (target)
6862                && STRINGP (XCAR (elt))
6863                && fast_string_match (XCAR (elt), target) >= 0)
6864               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6865         {
6866           val = XCDR (elt);
6867           /* Here, if VAL is both a valid coding system and a valid
6868              function symbol, we return VAL as a coding system.  */
6869           if (CONSP (val))
6870             return val;
6871           if (! SYMBOLP (val))
6872             return Qnil;
6873           if (! NILP (Fcoding_system_p (val)))
6874             return Fcons (val, val);
6875           if (! NILP (Ffboundp (val)))
6876             {
6877               val = call1 (val, Flist (nargs, args));
6878               if (CONSP (val))
6879                 return val;
6880               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
6881                 return Fcons (val, val);
6882             }
6883           return Qnil;
6884         }
6885     }
6886   return Qnil;
6887 }
6888
6889 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
6890        Supdate_coding_systems_internal, 0, 0, 0,
6891        doc: /* Update internal database for ISO2022 and CCL based coding systems.
6892 When values of any coding categories are changed, you must
6893 call this function.  */)
6894      ()
6895 {
6896   int i;
6897
6898   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
6899     {
6900       Lisp_Object val;
6901
6902       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
6903       if (!NILP (val))
6904         {
6905           if (! coding_system_table[i])
6906             coding_system_table[i] = ((struct coding_system *)
6907                                       xmalloc (sizeof (struct coding_system)));
6908           setup_coding_system (val, coding_system_table[i]);
6909         }
6910       else if (coding_system_table[i])
6911         {
6912           xfree (coding_system_table[i]);
6913           coding_system_table[i] = NULL;
6914         }
6915     }
6916
6917   return Qnil;
6918 }
6919
6920 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
6921        Sset_coding_priority_internal, 0, 0, 0,
6922        doc: /* Update internal database for the current value of `coding-category-list'.
6923 This function is internal use only.  */)
6924      ()
6925 {
6926   int i = 0, idx;
6927   Lisp_Object val;
6928
6929   val = Vcoding_category_list;
6930
6931   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
6932     {
6933       if (! SYMBOLP (XCAR (val)))
6934         break;
6935       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6936       if (idx >= CODING_CATEGORY_IDX_MAX)
6937         break;
6938       coding_priorities[i++] = (1 << idx);
6939       val = XCDR (val);
6940     }
6941   /* If coding-category-list is valid and contains all coding
6942      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
6943      the following code saves Emacs from crashing.  */
6944   while (i < CODING_CATEGORY_IDX_MAX)
6945     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6946
6947   return Qnil;
6948 }
6949
6950 #endif /* emacs */
6951
6952 \f
6953 /*** 9. Post-amble ***/
6954
6955 void
6956 init_coding_once ()
6957 {
6958   int i;
6959
6960   /* Emacs' internal format specific initialize routine.  */
6961   for (i = 0; i <= 0x20; i++)
6962     emacs_code_class[i] = EMACS_control_code;
6963   emacs_code_class[0x0A] = EMACS_linefeed_code;
6964   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6965   for (i = 0x21 ; i < 0x7F; i++)
6966     emacs_code_class[i] = EMACS_ascii_code;
6967   emacs_code_class[0x7F] = EMACS_control_code;
6968   for (i = 0x80; i < 0xFF; i++)
6969     emacs_code_class[i] = EMACS_invalid_code;
6970   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6971   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6972   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6973   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6974
6975   /* ISO2022 specific initialize routine.  */
6976   for (i = 0; i < 0x20; i++)
6977     iso_code_class[i] = ISO_control_0;
6978   for (i = 0x21; i < 0x7F; i++)
6979     iso_code_class[i] = ISO_graphic_plane_0;
6980   for (i = 0x80; i < 0xA0; i++)
6981     iso_code_class[i] = ISO_control_1;
6982   for (i = 0xA1; i < 0xFF; i++)
6983     iso_code_class[i] = ISO_graphic_plane_1;
6984   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6985   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6986   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6987   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6988   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6989   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6990   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6991   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6992   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6993   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6994
6995   setup_coding_system (Qnil, &keyboard_coding);
6996   setup_coding_system (Qnil, &terminal_coding);
6997   setup_coding_system (Qnil, &safe_terminal_coding);
6998   setup_coding_system (Qnil, &default_buffer_file_coding);
6999
7000   bzero (coding_system_table, sizeof coding_system_table);
7001
7002   bzero (ascii_skip_code, sizeof ascii_skip_code);
7003   for (i = 0; i < 128; i++)
7004     ascii_skip_code[i] = 1;
7005
7006 #if defined (MSDOS) || defined (WINDOWSNT)
7007   system_eol_type = CODING_EOL_CRLF;
7008 #else
7009   system_eol_type = CODING_EOL_LF;
7010 #endif
7011
7012   inhibit_pre_post_conversion = 0;
7013 }
7014
7015 #ifdef emacs
7016
7017 void
7018 syms_of_coding ()
7019 {
7020   Qtarget_idx = intern ("target-idx");
7021   staticpro (&Qtarget_idx);
7022
7023   Qcoding_system_history = intern ("coding-system-history");
7024   staticpro (&Qcoding_system_history);
7025   Fset (Qcoding_system_history, Qnil);
7026
7027   /* Target FILENAME is the first argument.  */
7028   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7029   /* Target FILENAME is the third argument.  */
7030   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7031
7032   Qcall_process = intern ("call-process");
7033   staticpro (&Qcall_process);
7034   /* Target PROGRAM is the first argument.  */
7035   Fput (Qcall_process, Qtarget_idx, make_number (0));
7036
7037   Qcall_process_region = intern ("call-process-region");
7038   staticpro (&Qcall_process_region);
7039   /* Target PROGRAM is the third argument.  */
7040   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7041
7042   Qstart_process = intern ("start-process");
7043   staticpro (&Qstart_process);
7044   /* Target PROGRAM is the third argument.  */
7045   Fput (Qstart_process, Qtarget_idx, make_number (2));
7046
7047   Qopen_network_stream = intern ("open-network-stream");
7048   staticpro (&Qopen_network_stream);
7049   /* Target SERVICE is the fourth argument.  */
7050   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7051
7052   Qcoding_system = intern ("coding-system");
7053   staticpro (&Qcoding_system);
7054
7055   Qeol_type = intern ("eol-type");
7056   staticpro (&Qeol_type);
7057
7058   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7059   staticpro (&Qbuffer_file_coding_system);
7060
7061   Qpost_read_conversion = intern ("post-read-conversion");
7062   staticpro (&Qpost_read_conversion);
7063
7064   Qpre_write_conversion = intern ("pre-write-conversion");
7065   staticpro (&Qpre_write_conversion);
7066
7067   Qno_conversion = intern ("no-conversion");
7068   staticpro (&Qno_conversion);
7069
7070   Qundecided = intern ("undecided");
7071   staticpro (&Qundecided);
7072
7073   Qcoding_system_p = intern ("coding-system-p");
7074   staticpro (&Qcoding_system_p);
7075
7076   Qcoding_system_error = intern ("coding-system-error");
7077   staticpro (&Qcoding_system_error);
7078
7079   Fput (Qcoding_system_error, Qerror_conditions,
7080         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7081   Fput (Qcoding_system_error, Qerror_message,
7082         build_string ("Invalid coding system"));
7083
7084   Qcoding_category = intern ("coding-category");
7085   staticpro (&Qcoding_category);
7086   Qcoding_category_index = intern ("coding-category-index");
7087   staticpro (&Qcoding_category_index);
7088
7089   Vcoding_category_table
7090     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7091   staticpro (&Vcoding_category_table);
7092   {
7093     int i;
7094     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7095       {
7096         XVECTOR (Vcoding_category_table)->contents[i]
7097           = intern (coding_category_name[i]);
7098         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7099               Qcoding_category_index, make_number (i));
7100       }
7101   }
7102
7103   Qtranslation_table = intern ("translation-table");
7104   staticpro (&Qtranslation_table);
7105   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7106
7107   Qtranslation_table_id = intern ("translation-table-id");
7108   staticpro (&Qtranslation_table_id);
7109
7110   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7111   staticpro (&Qtranslation_table_for_decode);
7112
7113   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7114   staticpro (&Qtranslation_table_for_encode);
7115
7116   Qsafe_chars = intern ("safe-chars");
7117   staticpro (&Qsafe_chars);
7118
7119   Qchar_coding_system = intern ("char-coding-system");
7120   staticpro (&Qchar_coding_system);
7121
7122   /* Intern this now in case it isn't already done.
7123      Setting this variable twice is harmless.
7124      But don't staticpro it here--that is done in alloc.c.  */
7125   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7126   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7127   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7128
7129   Qvalid_codes = intern ("valid-codes");
7130   staticpro (&Qvalid_codes);
7131
7132   Qemacs_mule = intern ("emacs-mule");
7133   staticpro (&Qemacs_mule);
7134
7135   Qraw_text = intern ("raw-text");
7136   staticpro (&Qraw_text);
7137
7138   defsubr (&Scoding_system_p);
7139   defsubr (&Sread_coding_system);
7140   defsubr (&Sread_non_nil_coding_system);
7141   defsubr (&Scheck_coding_system);
7142   defsubr (&Sdetect_coding_region);
7143   defsubr (&Sdetect_coding_string);
7144   defsubr (&Sfind_coding_systems_region_internal);
7145   defsubr (&Sdecode_coding_region);
7146   defsubr (&Sencode_coding_region);
7147   defsubr (&Sdecode_coding_string);
7148   defsubr (&Sencode_coding_string);
7149   defsubr (&Sdecode_sjis_char);
7150   defsubr (&Sencode_sjis_char);
7151   defsubr (&Sdecode_big5_char);
7152   defsubr (&Sencode_big5_char);
7153   defsubr (&Sset_terminal_coding_system_internal);
7154   defsubr (&Sset_safe_terminal_coding_system_internal);
7155   defsubr (&Sterminal_coding_system);
7156   defsubr (&Sset_keyboard_coding_system_internal);
7157   defsubr (&Skeyboard_coding_system);
7158   defsubr (&Sfind_operation_coding_system);
7159   defsubr (&Supdate_coding_systems_internal);
7160   defsubr (&Sset_coding_priority_internal);
7161
7162   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7163                doc: /* List of coding systems.
7164
7165 Do not alter the value of this variable manually.  This variable should be
7166 updated by the functions `make-coding-system' and
7167 `define-coding-system-alias'.  */);
7168   Vcoding_system_list = Qnil;
7169
7170   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7171                doc: /* Alist of coding system names.
7172 Each element is one element list of coding system name.
7173 This variable is given to `completing-read' as TABLE argument.
7174
7175 Do not alter the value of this variable manually.  This variable should be
7176 updated by the functions `make-coding-system' and
7177 `define-coding-system-alias'.  */);
7178   Vcoding_system_alist = Qnil;
7179
7180   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7181                doc: /* List of coding-categories (symbols) ordered by priority.
7182
7183 On detecting a coding system, Emacs tries code detection algorithms
7184 associated with each coding-category one by one in this order.  When
7185 one algorithm agrees with a byte sequence of source text, the coding
7186 system bound to the corresponding coding-category is selected.  */);
7187   {
7188     int i;
7189
7190     Vcoding_category_list = Qnil;
7191     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7192       Vcoding_category_list
7193         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7194                  Vcoding_category_list);
7195   }
7196
7197   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7198                doc: /* Specify the coding system for read operations.
7199 It is useful to bind this variable with `let', but do not set it globally.
7200 If the value is a coding system, it is used for decoding on read operation.
7201 If not, an appropriate element is used from one of the coding system alists:
7202 There are three such tables, `file-coding-system-alist',
7203 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7204   Vcoding_system_for_read = Qnil;
7205
7206   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7207                doc: /* Specify the coding system for write operations.
7208 Programs bind this variable with `let', but you should not set it globally.
7209 If the value is a coding system, it is used for encoding of output,
7210 when writing it to a file and when sending it to a file or subprocess.
7211
7212 If this does not specify a coding system, an appropriate element
7213 is used from one of the coding system alists:
7214 There are three such tables, `file-coding-system-alist',
7215 `process-coding-system-alist', and `network-coding-system-alist'.
7216 For output to files, if the above procedure does not specify a coding system,
7217 the value of `buffer-file-coding-system' is used.  */);
7218   Vcoding_system_for_write = Qnil;
7219
7220   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7221                doc: /* Coding system used in the latest file or process I/O.  */);
7222   Vlast_coding_system_used = Qnil;
7223
7224   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7225                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7226 See info node `Coding Systems' and info node `Text and Binary' concerning
7227 such conversion.  */);
7228   inhibit_eol_conversion = 0;
7229
7230   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7231                doc: /* Non-nil means process buffer inherits coding system of process output.
7232 Bind it to t if the process output is to be treated as if it were a file
7233 read from some filesystem.  */);
7234   inherit_process_coding_system = 0;
7235
7236   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7237                doc: /* Alist to decide a coding system to use for a file I/O operation.
7238 The format is ((PATTERN . VAL) ...),
7239 where PATTERN is a regular expression matching a file name,
7240 VAL is a coding system, a cons of coding systems, or a function symbol.
7241 If VAL is a coding system, it is used for both decoding and encoding
7242 the file contents.
7243 If VAL is a cons of coding systems, the car part is used for decoding,
7244 and the cdr part is used for encoding.
7245 If VAL is a function symbol, the function must return a coding system
7246 or a cons of coding systems which are used as above.  The function gets
7247 the arguments with which `find-operation-coding-systems' was called.
7248
7249 See also the function `find-operation-coding-system'
7250 and the variable `auto-coding-alist'.  */);
7251   Vfile_coding_system_alist = Qnil;
7252
7253   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7254     doc: /* Alist to decide a coding system to use for a process I/O operation.
7255 The format is ((PATTERN . VAL) ...),
7256 where PATTERN is a regular expression matching a program name,
7257 VAL is a coding system, a cons of coding systems, or a function symbol.
7258 If VAL is a coding system, it is used for both decoding what received
7259 from the program and encoding what sent to the program.
7260 If VAL is a cons of coding systems, the car part is used for decoding,
7261 and the cdr part is used for encoding.
7262 If VAL is a function symbol, the function must return a coding system
7263 or a cons of coding systems which are used as above.
7264
7265 See also the function `find-operation-coding-system'.  */);
7266   Vprocess_coding_system_alist = Qnil;
7267
7268   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7269     doc: /* Alist to decide a coding system to use for a network I/O operation.
7270 The format is ((PATTERN . VAL) ...),
7271 where PATTERN is a regular expression matching a network service name
7272 or is a port number to connect to,
7273 VAL is a coding system, a cons of coding systems, or a function symbol.
7274 If VAL is a coding system, it is used for both decoding what received
7275 from the network stream and encoding what sent to the network stream.
7276 If VAL is a cons of coding systems, the car part is used for decoding,
7277 and the cdr part is used for encoding.
7278 If VAL is a function symbol, the function must return a coding system
7279 or a cons of coding systems which are used as above.
7280
7281 See also the function `find-operation-coding-system'.  */);
7282   Vnetwork_coding_system_alist = Qnil;
7283
7284   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7285                doc: /* Coding system to use with system messages.
7286 Also used for decoding keyboard input on X Window system.  */);
7287   Vlocale_coding_system = Qnil;
7288
7289   /* The eol mnemonics are reset in startup.el system-dependently.  */
7290   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7291                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7292   eol_mnemonic_unix = build_string (":");
7293
7294   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7295                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7296   eol_mnemonic_dos = build_string ("\\");
7297
7298   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7299                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7300   eol_mnemonic_mac = build_string ("/");
7301
7302   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7303                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7304   eol_mnemonic_undecided = build_string (":");
7305
7306   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7307                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7308   Venable_character_translation = Qt;
7309
7310   DEFVAR_LISP ("standard-translation-table-for-decode",
7311                &Vstandard_translation_table_for_decode,
7312                doc: /* Table for translating characters while decoding.  */);
7313   Vstandard_translation_table_for_decode = Qnil;
7314
7315   DEFVAR_LISP ("standard-translation-table-for-encode",
7316                &Vstandard_translation_table_for_encode,
7317                doc: /* Table for translating characters while encoding.  */);
7318   Vstandard_translation_table_for_encode = Qnil;
7319
7320   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7321                doc: /* Alist of charsets vs revision numbers.
7322 While encoding, if a charset (car part of an element) is found,
7323 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7324   Vcharset_revision_alist = Qnil;
7325
7326   DEFVAR_LISP ("default-process-coding-system",
7327                &Vdefault_process_coding_system,
7328                doc: /* Cons of coding systems used for process I/O by default.
7329 The car part is used for decoding a process output,
7330 the cdr part is used for encoding a text to be sent to a process.  */);
7331   Vdefault_process_coding_system = Qnil;
7332
7333   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7334                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7335 This is a vector of length 256.
7336 If Nth element is non-nil, the existence of code N in a file
7337 \(or output of subprocess) doesn't prevent it to be detected as
7338 a coding system of ISO 2022 variant which has a flag
7339 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7340 or reading output of a subprocess.
7341 Only 128th through 159th elements has a meaning.  */);
7342   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7343
7344   DEFVAR_LISP ("select-safe-coding-system-function",
7345                &Vselect_safe_coding_system_function,
7346                doc: /* Function to call to select safe coding system for encoding a text.
7347
7348 If set, this function is called to force a user to select a proper
7349 coding system which can encode the text in the case that a default
7350 coding system used in each operation can't encode the text.
7351
7352 The default value is `select-safe-coding-system' (which see).  */);
7353   Vselect_safe_coding_system_function = Qnil;
7354
7355   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
7356                doc: /* Char-table containing safe coding systems of each characters.
7357 Each element doesn't include such generic coding systems that can
7358 encode any characters.   They are in the first extra slot.  */);
7359   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
7360
7361   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7362                &inhibit_iso_escape_detection,
7363                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7364
7365 By default, on reading a file, Emacs tries to detect how the text is
7366 encoded.  This code detection is sensitive to escape sequences.  If
7367 the sequence is valid as ISO2022, the code is determined as one of
7368 the ISO2022 encodings, and the file is decoded by the corresponding
7369 coding system (e.g. `iso-2022-7bit').
7370
7371 However, there may be a case that you want to read escape sequences in
7372 a file as is.  In such a case, you can set this variable to non-nil.
7373 Then, as the code detection ignores any escape sequences, no file is
7374 detected as encoded in some ISO2022 encoding.  The result is that all
7375 escape sequences become visible in a buffer.
7376
7377 The default value is nil, and it is strongly recommended not to change
7378 it.  That is because many Emacs Lisp source files that contain
7379 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7380 in Emacs's distribution, and they won't be decoded correctly on
7381 reading if you suppress escape sequence detection.
7382
7383 The other way to read escape sequences in a file without decoding is
7384 to explicitly specify some coding system that doesn't use ISO2022's
7385 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
7386   inhibit_iso_escape_detection = 0;
7387 }
7388
7389 char *
7390 emacs_strerror (error_number)
7391      int error_number;
7392 {
7393   char *str;
7394
7395   synchronize_system_messages_locale ();
7396   str = strerror (error_number);
7397
7398   if (! NILP (Vlocale_coding_system))
7399     {
7400       Lisp_Object dec = code_convert_string_norecord (build_string (str),
7401                                                       Vlocale_coding_system,
7402                                                       0);
7403       str = (char *) XSTRING (dec)->data;
7404     }
7405
7406   return str;
7407 }
7408
7409 #endif /* emacs */
7410