src/coding.c

/* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995,97,1998,2002,2003  Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002,2003  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
  DST_BYTES zero means that the source area and destination area are
  overlapped, which means that we can produce encoded text until it
  reaches the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
/* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
   get one and two bytes from the source text respectively.  If there
   are not enough bytes in the source, they jump to
   `label_end_of_loop'.  The caller should set variables `coding',
   `src' and `src_end' to appropriate pointers in advance.  These
   macros are called from decoding routines `decode_coding_XXX', thus
   it is assumed that the source text is unibyte.  */
 195
 196 #define ONE_MORE_BYTE(c1)                                       \
 197   do {                                                          \
 198     if (src >= src_end)                                         \
 199       {                                                         \
 200         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 201         goto label_end_of_loop;                                 \
 202       }                                                         \
 203     c1 = *src++;                                                \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src + 1 >= src_end)                                     \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = *src++;                                                \
 214     c2 = *src++;                                                \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 219    form if MULTIBYTEP is nonzero.  */
 220
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 230       c1 = *src++ - 0x20;                                       \
 231   } while (0)
 232
 233 /* Set C to the next character at the source text pointed by `src'.
 234    If there are not enough characters in the source, jump to
 235    `label_end_of_loop'.  The caller should set variables `coding'
 236    `src', `src_end', and `translation_table' to appropriate pointers
 237    in advance.  This macro is used in encoding routines
 238    `encode_coding_XXX', thus it assumes that the source text is in
 239    multibyte form except for 8-bit characters.  8-bit characters are
 240    in multibyte form if coding->src_multibyte is nonzero, else they
 241    are represented by a single byte.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;                                      \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  Jump to
 264    `label_end_of_loop' if there's not enough space at `dst'.
 265
 266    If we are now in the middle of a composition sequence, the decoded
 267    character may be ALTCHAR (for the current composition).  In that
 268    case, the character goes to coding->cmp_data->data instead of
 269    `dst'.
 270
 271    This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {                                                          \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {                                                          \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)                                           \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348 #include "intervals.h"
 349
 350 #else  /* not emacs */
 351
 352 #include "mulelib.h"
 353
 354 #endif /* not emacs */
 355
 356 Lisp_Object Qcoding_system, Qeol_type;
 357 Lisp_Object Qbuffer_file_coding_system;
 358 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 359 Lisp_Object Qno_conversion, Qundecided;
 360 Lisp_Object Qcoding_system_history;
 361 Lisp_Object Qsafe_chars;
 362 Lisp_Object Qvalid_codes;
 363
 364 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 365 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 366 Lisp_Object Qstart_process, Qopen_network_stream;
 367 Lisp_Object Qtarget_idx;
 368
 369 Lisp_Object Vselect_safe_coding_system_function;
 370
 371 int coding_system_require_warning;
 372
 373 /* Mnemonic string for each format of end-of-line.  */
 374 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 375 /* Mnemonic string to indicate format of end-of-line is not yet
 376    decided.  */
 377 Lisp_Object eol_mnemonic_undecided;
 378
 379 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 380    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 381 int system_eol_type;
 382
 383 #ifdef emacs
 384
 385 /* Information about which coding system is safe for which chars.
 386    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 387
 388    GENERIC-LIST is a list of generic coding systems which can encode
 389    any characters.
 390
 391    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 392    corresponding char table that contains safe chars.  */
 393 Lisp_Object Vcoding_system_safe_chars;
 394
 395 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 396
 397 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 398
 399 /* Coding system emacs-mule and raw-text are for converting only
 400    end-of-line format.  */
 401 Lisp_Object Qemacs_mule, Qraw_text;
 402
 403 Lisp_Object Qutf_8;
 404
 405 /* Coding-systems are handed between Emacs Lisp programs and C internal
 406    routines by the following three variables.  */
 407 /* Coding-system for reading files and receiving data from process.  */
 408 Lisp_Object Vcoding_system_for_read;
 409 /* Coding-system for writing files and sending data to process.  */
 410 Lisp_Object Vcoding_system_for_write;
 411 /* Coding-system actually used in the latest I/O.  */
 412 Lisp_Object Vlast_coding_system_used;
 413
 414 /* A vector of length 256 which contains information about special
 415    Latin codes (especially for dealing with Microsoft codes).  */
 416 Lisp_Object Vlatin_extra_code_table;
 417
 418 /* Flag to inhibit code conversion of end-of-line format.  */
 419 int inhibit_eol_conversion;
 420
 421 /* Flag to inhibit ISO2022 escape sequence detection.  */
 422 int inhibit_iso_escape_detection;
 423
 424 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 425 int inherit_process_coding_system;
 426
 427 /* Coding system to be used to encode text for terminal display.  */
 428 struct coding_system terminal_coding;
 429
 430 /* Coding system to be used to encode text for terminal display when
 431    terminal coding system is nil.  */
 432 struct coding_system safe_terminal_coding;
 433
 434 /* Coding system of what is sent from terminal keyboard.  */
 435 struct coding_system keyboard_coding;
 436
 437 /* Default coding system to be used to write a file.  */
 438 struct coding_system default_buffer_file_coding;
 439
 440 Lisp_Object Vfile_coding_system_alist;
 441 Lisp_Object Vprocess_coding_system_alist;
 442 Lisp_Object Vnetwork_coding_system_alist;
 443
 444 Lisp_Object Vlocale_coding_system;
 445
 446 #endif /* emacs */
 447
 448 Lisp_Object Qcoding_category, Qcoding_category_index;
 449
 450 /* List of symbols `coding-category-xxx' ordered by priority.  */
 451 Lisp_Object Vcoding_category_list;
 452
 453 /* Table of coding categories (Lisp symbols).  */
 454 Lisp_Object Vcoding_category_table;
 455
 456 /* Table of names of symbol for each coding-category.  */
 457 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 458   "coding-category-emacs-mule",
 459   "coding-category-sjis",
 460   "coding-category-iso-7",
 461   "coding-category-iso-7-tight",
 462   "coding-category-iso-8-1",
 463   "coding-category-iso-8-2",
 464   "coding-category-iso-7-else",
 465   "coding-category-iso-8-else",
 466   "coding-category-ccl",
 467   "coding-category-big5",
 468   "coding-category-utf-8",
 469   "coding-category-utf-16-be",
 470   "coding-category-utf-16-le",
 471   "coding-category-raw-text",
 472   "coding-category-binary"
 473 };
 474
 475 /* Table of pointers to coding systems corresponding to each coding
 476    categories.  */
 477 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 478
 479 /* Table of coding category masks.  Nth element is a mask for a coding
 480    category of which priority is Nth.  */
 481 static
 482 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 483
 484 /* Flag to tell if we look up translation table on character code
 485    conversion.  */
 486 Lisp_Object Venable_character_translation;
 487 /* Standard translation table to look up on decoding (reading).  */
 488 Lisp_Object Vstandard_translation_table_for_decode;
 489 /* Standard translation table to look up on encoding (writing).  */
 490 Lisp_Object Vstandard_translation_table_for_encode;
 491
 492 Lisp_Object Qtranslation_table;
 493 Lisp_Object Qtranslation_table_id;
 494 Lisp_Object Qtranslation_table_for_decode;
 495 Lisp_Object Qtranslation_table_for_encode;
 496
 497 /* Alist of charsets vs revision number.  */
 498 Lisp_Object Vcharset_revision_alist;
 499
 500 /* Default coding systems used for process I/O.  */
 501 Lisp_Object Vdefault_process_coding_system;
 502
 503 /* Char table for translating Quail and self-inserting input.  */
 504 Lisp_Object Vtranslation_table_for_input;
 505
 506 /* Global flag to tell that we can't call post-read-conversion and
 507    pre-write-conversion functions.  Usually the value is zero, but it
 508    is set to 1 temporarily while such functions are running.  This is
 509    to avoid infinite recursive call.  */
 510 static int inhibit_pre_post_conversion;
 511
 512 Lisp_Object Qchar_coding_system;
 513
 514 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 515    its validity.  */
 516
 517 Lisp_Object
 518 coding_safe_chars (coding_system)
 519      Lisp_Object coding_system;
 520 {
 521   Lisp_Object coding_spec, plist, safe_chars;
 522
 523   coding_spec = Fget (coding_system, Qcoding_system);
 524   plist = XVECTOR (coding_spec)->contents[3];
 525   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 526   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 527 }
 528
 529 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 530   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 531
 532 \f
 533 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 534
 535 /* Emacs' internal format for representation of multiple character
 536    sets is a kind of multi-byte encoding, i.e. characters are
 537    represented by variable-length sequences of one-byte codes.
 538
 539    ASCII characters and control characters (e.g. `tab', `newline') are
 540    represented by one-byte sequences which are their ASCII codes, in
 541    the range 0x00 through 0x7F.
 542
 543    8-bit characters of the range 0x80..0x9F are represented by
 544    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 545    code + 0x20).
 546
 547    8-bit characters of the range 0xA0..0xFF are represented by
 548    one-byte sequences which are their 8-bit code.
 549
 550    The other characters are represented by a sequence of `base
 551    leading-code', optional `extended leading-code', and one or two
 552    `position-code's.  The length of the sequence is determined by the
 553    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 554    whereas extended leading-code and position-code take the range 0xA0
 555    through 0xFF.  See `charset.h' for more details about leading-code
 556    and position-code.
 557
 558    --- CODE RANGE of Emacs' internal format ---
 559    character set        range
 560    -------------        -----
 561    ascii                0x00..0x7F
 562    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
 564    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 565    ---------------------------------------------
 566
 567    As this is the internal character representation, the format is
   usually not used externally (i.e. in a file or in data sent to a
 569    process).  But, it is possible to have a text externally in this
 570    format (i.e. by encoding by the coding system `emacs-mule').
 571
 572    In that case, a sequence of one-byte codes has a slightly different
 573    form.
 574
 575    Firstly, all characters in eight-bit-control are represented by
 576    one-byte sequences which are their 8-bit code.
 577
 578    Next, character composition data are represented by the byte
 579    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 580    where,
 581         METHOD is 0xF0 plus one of composition method (enum
 582         composition_method),
 583
 584         BYTES is 0xA0 plus the byte length of these composition data,
 585
 586         CHARS is 0xA0 plus the number of characters composed by these
 587         data,
 588
 589         COMPONENTs are characters of multibyte form or composition
        rules encoded by two bytes of ASCII codes.
 591
 592    In addition, for backward compatibility, the following formats are
 593    also recognized as composition data on decoding.
 594
 595    0x80 MSEQ ...
 596    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 597
 598    Here,
        MSEQ is a multibyte form but in this special format:
 600           ASCII: 0xA0 ASCII_CODE+0x80,
 601           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 602         RULE is a one byte code of the range 0xA0..0xF0 that
 603         represents a composition rule.
 604   */
 605
 606 enum emacs_code_class_type emacs_code_class[256];
 607
 608 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 609    Check if a text is encoded in Emacs' internal format.  If it is,
 610    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 611
 612 static int
 613 detect_coding_emacs_mule (src, src_end, multibytep)
 614       unsigned char *src, *src_end;
 615       int multibytep;
 616 {
 617   unsigned char c;
 618   int composing = 0;
 619   /* Dummy for ONE_MORE_BYTE.  */
 620   struct coding_system dummy_coding;
 621   struct coding_system *coding = &dummy_coding;
 622
 623   while (1)
 624     {
 625       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 626
 627       if (composing)
 628         {
 629           if (c < 0xA0)
 630             composing = 0;
 631           else if (c == 0xA0)
 632             {
 633               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 634               c &= 0x7F;
 635             }
 636           else
 637             c -= 0x20;
 638         }
 639
 640       if (c < 0x20)
 641         {
 642           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 643             return 0;
 644         }
 645       else if (c >= 0x80 && c < 0xA0)
 646         {
 647           if (c == 0x80)
 648             /* Old leading code for a composite character.  */
 649             composing = 1;
 650           else
 651             {
 652               unsigned char *src_base = src - 1;
 653               int bytes;
 654
 655               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 656                                                bytes))
 657                 return 0;
 658               src = src_base + bytes;
 659             }
 660         }
 661     }
 662  label_end_of_loop:
 663   return CODING_CATEGORY_MASK_EMACS_MULE;
 664 }
 665
 666
 667 /* Record the starting position START and METHOD of one composition.  */
 668
 669 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 670   do {                                                          \
 671     struct composition_data *cmp_data = coding->cmp_data;       \
 672     int *data = cmp_data->data + cmp_data->used;                \
 673     coding->cmp_data_start = cmp_data->used;                    \
 674     data[0] = -1;                                               \
 675     data[1] = cmp_data->char_offset + start;                    \
 676     data[3] = (int) method;                                     \
 677     cmp_data->used += 4;                                        \
 678   } while (0)
 679
 680 /* Record the ending position END of the current composition.  */
 681
 682 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 683   do {                                                          \
 684     struct composition_data *cmp_data = coding->cmp_data;       \
 685     int *data = cmp_data->data + coding->cmp_data_start;        \
 686     data[0] = cmp_data->used - coding->cmp_data_start;          \
 687     data[2] = cmp_data->char_offset + end;                      \
 688   } while (0)
 689
 690 /* Record one COMPONENT (alternate character or composition rule).  */
 691
 692 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 693   do {                                                                  \
 694     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 695     if (coding->cmp_data->used - coding->cmp_data_start                 \
 696         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 697       {                                                                 \
 698         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 699         coding->composing = COMPOSITION_NO;                             \
 700       }                                                                 \
 701   } while (0)
 702
 703
 704 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 705    is not less than SRC_END, return -1 without incrementing Src.  */
 706
 707 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 708
 709
 710 /* Decode a character represented as a component of composition
 711    sequence of Emacs 20 style at SRC.  Set C to that character, store
 712    its multibyte form sequence at P, and set P to the end of that
 713    sequence.  If no valid character is found, set C to -1.  */
 714
 715 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 716   do {                                                          \
 717     int bytes;                                                  \
 718                                                                 \
 719     c = SAFE_ONE_MORE_BYTE ();                                  \
 720     if (c < 0)                                                  \
 721       break;                                                    \
 722     if (CHAR_HEAD_P (c))                                        \
 723       c = -1;                                                   \
 724     else if (c == 0xA0)                                         \
 725       {                                                         \
 726         c = SAFE_ONE_MORE_BYTE ();                              \
 727         if (c < 0xA0)                                           \
 728           c = -1;                                               \
 729         else                                                    \
 730           {                                                     \
 731             c -= 0xA0;                                          \
 732             *p++ = c;                                           \
 733           }                                                     \
 734       }                                                         \
 735     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 736       {                                                         \
 737         unsigned char *p0 = p;                                  \
 738                                                                 \
 739         c -= 0x20;                                              \
 740         *p++ = c;                                               \
 741         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 742         while (--bytes)                                         \
 743           {                                                     \
 744             c = SAFE_ONE_MORE_BYTE ();                          \
 745             if (c < 0)                                          \
 746               break;                                            \
 747             *p++ = c;                                           \
 748           }                                                     \
 749         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 750             || (coding->flags /* We are recovering a file.  */  \
 751                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 752                 && ! CHAR_HEAD_P (p0[1])))                      \
 753           c = STRING_CHAR (p0, bytes);                          \
 754         else                                                    \
 755           c = -1;                                               \
 756       }                                                         \
 757     else                                                        \
 758       c = -1;                                                   \
 759   } while (0)
 760
 761
 762 /* Decode a composition rule represented as a component of composition
 763    sequence of Emacs 20 style at SRC.  Set C to the rule.  If not
 764    valid rule is found, set C to -1.  */
 765
 766 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 767   do {                                                  \
 768     c = SAFE_ONE_MORE_BYTE ();                          \
 769     c -= 0xA0;                                          \
 770     if (c < 0 || c >= 81)                               \
 771       c = -1;                                           \
 772     else                                                \
 773       {                                                 \
 774         gref = c / 9, nref = c % 9;                     \
 775         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 776       }                                                 \
 777   } while (0)
 778
 779
 780 /* Decode composition sequence encoded by `emacs-mule' at the source
 781    pointed by SRC.  SRC_END is the end of source.  Store information
 782    of the composition in CODING->cmp_data.
 783
 784    For backward compatibility, decode also a composition sequence of
 785    Emacs 20 style.  In that case, the composition sequence contains
 786    characters that should be extracted into a buffer or string.  Store
 787    those characters at *DESTINATION in multibyte form.
 788
 789    If we encounter an invalid byte sequence, return 0.
 790    If we encounter an insufficient source or destination, or
 791    insufficient space in CODING->cmp_data, return 1.
 792    Otherwise, return consumed bytes in the source.
 793
 794 */
 795 static INLINE int
 796 decode_composition_emacs_mule (coding, src, src_end,
 797                                destination, dst_end, dst_bytes)
 798      struct coding_system *coding;
 799      unsigned char *src, *src_end, **destination, *dst_end;
 800      int dst_bytes;
 801 {
 802   unsigned char *dst = *destination;
 803   int method, data_len, nchars;
 804   unsigned char *src_base = src++;
 805   /* Store components of composition.  */
 806   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 807   int ncomponent;
 808   /* Store multibyte form of characters to be composed.  This is for
 809      Emacs 20 style composition sequence.  */
 810   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 811   unsigned char *bufp = buf;
 812   int c, i, gref, nref;
 813
 814   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 815       >= COMPOSITION_DATA_SIZE)
 816     {
 817       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 818       return -1;
 819     }
 820
 821   ONE_MORE_BYTE (c);
 822   if (c - 0xF0 >= COMPOSITION_RELATIVE
 823            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 824     {
 825       int with_rule;
 826
 827       method = c - 0xF0;
 828       with_rule = (method == COMPOSITION_WITH_RULE
 829                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 830       ONE_MORE_BYTE (c);
 831       data_len = c - 0xA0;
 832       if (data_len < 4
 833           || src_base + data_len > src_end)
 834         return 0;
 835       ONE_MORE_BYTE (c);
 836       nchars = c - 0xA0;
 837       if (c < 1)
 838         return 0;
 839       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 840         {
 841           /* If it is longer than this, it can't be valid.  */
 842           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 843             return 0;
 844
 845           if (ncomponent % 2 && with_rule)
 846             {
 847               ONE_MORE_BYTE (gref);
 848               gref -= 32;
 849               ONE_MORE_BYTE (nref);
 850               nref -= 32;
 851               c = COMPOSITION_ENCODE_RULE (gref, nref);
 852             }
 853           else
 854             {
 855               int bytes;
 856               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 857                   || (coding->flags /* We are recovering a file.  */
 858                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 859                       && ! CHAR_HEAD_P (src[1])))
 860                 c = STRING_CHAR (src, bytes);
 861               else
 862                 c = *src, bytes = 1;
 863               src += bytes;
 864             }
 865           component[ncomponent] = c;
 866         }
 867     }
 868   else
 869     {
 870       /* This may be an old Emacs 20 style format.  See the comment at
 871          the section 2 of this file.  */
 872       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 873       if (src == src_end
 874           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 875         goto label_end_of_loop;
 876
 877       src_end = src;
 878       src = src_base + 1;
 879       if (c < 0xC0)
 880         {
 881           method = COMPOSITION_RELATIVE;
 882           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 883             {
 884               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 885               if (c < 0)
 886                 break;
 887               component[ncomponent++] = c;
 888             }
 889           if (ncomponent < 2)
 890             return 0;
 891           nchars = ncomponent;
 892         }
 893       else if (c == 0xFF)
 894         {
 895           method = COMPOSITION_WITH_RULE;
 896           src++;
 897           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 898           if (c < 0)
 899             return 0;
 900           component[0] = c;
 901           for (ncomponent = 1;
 902                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 903             {
 904               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 905               if (c < 0)
 906                 break;
 907               component[ncomponent++] = c;
 908               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 909               if (c < 0)
 910                 break;
 911               component[ncomponent++] = c;
 912             }
 913           if (ncomponent < 3)
 914             return 0;
 915           nchars = (ncomponent + 1) / 2;
 916         }
 917       else
 918         return 0;
 919     }
 920
 921   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 922     {
 923       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 924       for (i = 0; i < ncomponent; i++)
 925         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 926       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 927       if (buf < bufp)
 928         {
 929           unsigned char *p = buf;
 930           EMIT_BYTES (p, bufp);
 931           *destination += bufp - buf;
 932           coding->produced_char += nchars;
 933         }
 934       return (src - src_base);
 935     }
 936  label_end_of_loop:
 937   return -1;
 938 }
 939
 940 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 941
 942 static void
 943 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 944      struct coding_system *coding;
 945      unsigned char *source, *destination;
 946      int src_bytes, dst_bytes;
 947 {
 948   unsigned char *src = source;
 949   unsigned char *src_end = source + src_bytes;
 950   unsigned char *dst = destination;
 951   unsigned char *dst_end = destination + dst_bytes;
 952   /* SRC_BASE remembers the start position in source in each loop.
 953      The loop will be exited when there's not enough source code, or
 954      when there's not enough destination area to produce a
 955      character.  */
 956   unsigned char *src_base;
 957
 958   coding->produced_char = 0;
 959   while ((src_base = src) < src_end)
 960     {
 961       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 962       int bytes;
 963
 964       if (*src == '\r')
 965         {
 966           int c = *src++;
 967
 968           if (coding->eol_type == CODING_EOL_CR)
 969             c = '\n';
 970           else if (coding->eol_type == CODING_EOL_CRLF)
 971             {
 972               ONE_MORE_BYTE (c);
 973               if (c != '\n')
 974                 {
 975                   src--;
 976                   c = '\r';
 977                 }
 978             }
 979           *dst++ = c;
 980           coding->produced_char++;
 981           continue;
 982         }
 983       else if (*src == '\n')
 984         {
 985           if ((coding->eol_type == CODING_EOL_CR
 986                || coding->eol_type == CODING_EOL_CRLF)
 987               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 988             {
 989               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 990               goto label_end_of_loop;
 991             }
 992           *dst++ = *src++;
 993           coding->produced_char++;
 994           continue;
 995         }
 996       else if (*src == 0x80 && coding->cmp_data)
 997         {
 998           /* Start of composition data.  */
 999           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1000                                                          &dst, dst_end,
1001                                                          dst_bytes);
1002           if (consumed < 0)
1003             goto label_end_of_loop;
1004           else if (consumed > 0)
1005             {
1006               src += consumed;
1007               continue;
1008             }
1009           bytes = CHAR_STRING (*src, tmp);
1010           p = tmp;
1011           src++;
1012         }
1013       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1014                || (coding->flags /* We are recovering a file.  */
1015                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1016                    && ! CHAR_HEAD_P (src[1])))
1017         {
1018           p = src;
1019           src += bytes;
1020         }
1021       else
1022         {
1023           bytes = CHAR_STRING (*src, tmp);
1024           p = tmp;
1025           src++;
1026         }
1027       if (dst + bytes >= (dst_bytes ? dst_end : src))
1028         {
1029           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1030           break;
1031         }
1032       while (bytes--) *dst++ = *p++;
1033       coding->produced_char++;
1034     }
1035  label_end_of_loop:
1036   coding->consumed = coding->consumed_char = src_base - source;
1037   coding->produced = dst - destination;
1038 }
1039
1040
/* Encode the composition record at DATA as a special byte sequence
   starting by 0x80 and emit it at `dst'.  The sequence is

        0x80 METHOD LENGTH NCHARS COMPONENT ...

   with METHOD = 0xF0 + DATA[3], LENGTH = 0xA0 + the byte length of
   the whole sequence, and NCHARS = 0xA0 + (DATA[2] - DATA[1]).
   DATA[0] is the length of the record and the components start at
   DATA[4].  For the two rule-based methods the components alternate
   between characters and rules, each rule being written as the two
   bytes 0x20 + GREF and 0x20 + NREF; otherwise every component is a
   character.

   If the destination is too small, set CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
   Otherwise update CODING->cmp_data_start and maybe CODING->cmp_data
   for the next call.

   `dst', `dst_end', `dst_bytes' and `src' are variables of the
   expanding scope.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char cmp_buf[1024];                                        \
    unsigned char *cmp_tail = cmp_buf + 4;                              \
    unsigned char *cmp_from = cmp_buf;                                  \
    int cmp_len = data[0];                                              \
    int cmp_idx = 4;                                                    \
                                                                        \
    if (data[3] == COMPOSITION_WITH_RULE                                \
        || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
      {                                                                 \
        /* First character, then RULE CHARACTER pairs.  */              \
        cmp_tail += CHAR_STRING (data[cmp_idx], cmp_tail);              \
        for (cmp_idx++; cmp_idx < cmp_len; cmp_idx += 2)                \
          {                                                             \
            int gref, nref;                                             \
                                                                        \
            COMPOSITION_DECODE_RULE (data[cmp_idx], gref, nref);        \
            *cmp_tail++ = 0x20 + gref;                                  \
            *cmp_tail++ = 0x20 + nref;                                  \
            cmp_tail += CHAR_STRING (data[cmp_idx + 1], cmp_tail);      \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (; cmp_idx < cmp_len; cmp_idx++)                            \
          cmp_tail += CHAR_STRING (data[cmp_idx], cmp_tail);            \
      }                                                                 \
                                                                        \
    /* The header can be completed now that the length is known.  */    \
    cmp_buf[0] = 0x80;                                                  \
    cmp_buf[1] = 0xF0 + data[3];               /* METHOD */             \
    cmp_buf[2] = 0xA0 + (cmp_tail - cmp_buf);  /* COMPONENTS-BYTES */   \
    cmp_buf[3] = 0xA0 + (data[2] - data[1]);   /* COMPOSED-CHARS */     \
                                                                        \
    if (dst + (cmp_tail - cmp_buf) + 4 > (dst_bytes ? dst_end : src))   \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    while (cmp_from < cmp_tail)                                         \
      *dst++ = *cmp_from++;                                             \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used                \
        && coding->cmp_data->next)                                      \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
1090
1091
1092 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1093                             unsigned char *, int, int));
1094
1095 static void
1096 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1097      struct coding_system *coding;
1098      unsigned char *source, *destination;
1099      int src_bytes, dst_bytes;
1100 {
1101   unsigned char *src = source;
1102   unsigned char *src_end = source + src_bytes;
1103   unsigned char *dst = destination;
1104   unsigned char *dst_end = destination + dst_bytes;
1105   unsigned char *src_base;
1106   int c;
1107   int char_offset;
1108   int *data;
1109
1110   Lisp_Object translation_table;
1111
1112   translation_table = Qnil;
1113
1114   /* Optimization for the case that there's no composition.  */
1115   if (!coding->cmp_data || coding->cmp_data->used == 0)
1116     {
1117       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1118       return;
1119     }
1120
1121   char_offset = coding->cmp_data->char_offset;
1122   data = coding->cmp_data->data + coding->cmp_data_start;
1123   while (1)
1124     {
1125       src_base = src;
1126
1127       /* If SRC starts a composition, encode the information about the
1128          composition in advance.  */
1129       if (coding->cmp_data_start < coding->cmp_data->used
1130           && char_offset + coding->consumed_char == data[1])
1131         {
1132           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1133           char_offset = coding->cmp_data->char_offset;
1134           data = coding->cmp_data->data + coding->cmp_data_start;
1135         }
1136
1137       ONE_MORE_CHAR (c);
1138       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1139                         || coding->eol_type == CODING_EOL_CR))
1140         {
1141           if (coding->eol_type == CODING_EOL_CRLF)
1142             EMIT_TWO_BYTES ('\r', c);
1143           else
1144             EMIT_ONE_BYTE ('\r');
1145         }
1146       else if (SINGLE_BYTE_CHAR_P (c))
1147         {
1148           if (coding->flags && ! ASCII_BYTE_P (c))
1149             {
1150               /* As we are auto saving, retain the multibyte form for
1151                  8-bit chars.  */
1152               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1153               int bytes = CHAR_STRING (c, buf);
1154
1155               if (bytes == 1)
1156                 EMIT_ONE_BYTE (buf[0]);
1157               else
1158                 EMIT_TWO_BYTES (buf[0], buf[1]);
1159             }
1160           else
1161             EMIT_ONE_BYTE (c);
1162         }
1163       else
1164         EMIT_BYTES (src_base, src);
1165       coding->consumed_char++;
1166     }
1167  label_end_of_loop:
1168   coding->consumed = src_base - source;
1169   coding->produced = coding->produced_char = dst - destination;
1170   return;
1171 }
1172
1173 \f
1174 /*** 3. ISO2022 handlers ***/
1175
1176 /* The following note describes the coding system ISO2022 briefly.
1177    Since the intention of this note is to help understand the
1178    functions in this file, some parts are NOT ACCURATE or are OVERLY
1179    SIMPLIFIED.  For thorough understanding, please refer to the
1180    original document of ISO2022.  This is equivalent to the standard
1181    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1182
1183    ISO2022 provides many mechanisms to encode several character sets
1184    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1185    is encoded using bytes less than 128.  This may make the encoded
1186    text a little bit longer, but the text passes more easily through
1187    several types of gateway, some of which strip off the MSB (Most
1188    Significant Bit).
1189
1190    There are two kinds of character sets: control character sets and
1191    graphic character sets.  The former contain control characters such
1192    as `newline' and `escape' to provide control functions (control
1193    functions are also provided by escape sequences).  The latter
1194    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1195    two control character sets and many graphic character sets.
1196
1197    Graphic character sets are classified into one of the following
1198    four classes, according to the number of bytes (DIMENSION) and
1199    number of characters in one dimension (CHARS) of the set:
1200    - DIMENSION1_CHARS94
1201    - DIMENSION1_CHARS96
1202    - DIMENSION2_CHARS94
1203    - DIMENSION2_CHARS96
1204
1205    In addition, each character set is assigned an identification tag,
1206    unique for each set, called the "final character" (denoted as <F>
1207    hereafter).  The <F> of each character set is decided by ECMA(*)
1208    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1209    (0x30..0x3F are for private use only).
1210
1211    Note (*): ECMA = European Computer Manufacturers Association
1212
1213    Here are examples of graphic character sets [NAME(<F>)]:
1214         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1215         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1216         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1217         o DIMENSION2_CHARS96 -- none for the moment
1218
1219    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1220         C0 [0x00..0x1F] -- control character plane 0
1221         GL [0x20..0x7F] -- graphic character plane 0
1222         C1 [0x80..0x9F] -- control character plane 1
1223         GR [0xA0..0xFF] -- graphic character plane 1
1224
1225    A control character set is directly designated and invoked to C0 or
1226    C1 by an escape sequence.  The most common case is that:
1227    - ISO646's  control character set is designated/invoked to C0, and
1228    - ISO6429's control character set is designated/invoked to C1,
1229    and usually these designations/invocations are omitted in encoded
1230    text.  In a 7-bit environment, only C0 can be used, and a control
1231    character for C1 is encoded by an appropriate escape sequence to
1232    fit into the environment.  All control characters for C1 are
1233    defined to have corresponding escape sequences.
1234
1235    A graphic character set is at first designated to one of four
1236    graphic registers (G0 through G3), then these graphic registers are
1237    invoked to GL or GR.  These designations and invocations can be
1238    done independently.  The most common case is that G0 is invoked to
1239    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1240    these invocations and designations are omitted in encoded text.
1241    In a 7-bit environment, only GL can be used.
1242
1243    When a graphic character set of CHARS94 is invoked to GL, codes
1244    0x20 and 0x7F of the GL area work as control characters SPACE and
1245    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1246    be used.
1247
1248    There are two ways of invocation: locking-shift and single-shift.
1249    With locking-shift, the invocation lasts until the next different
1250    invocation, whereas with single-shift, the invocation affects the
1251    following character only and doesn't affect the locking-shift
1252    state.  Invocations are done by the following control characters or
1253    escape sequences:
1254
1255    ----------------------------------------------------------------------
1256    abbrev  function                  cntrl escape seq   description
1257    ----------------------------------------------------------------------
1258    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1259    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1260    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1261    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1262    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1263    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1264    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1265    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1266    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1267    ----------------------------------------------------------------------
1268    (*) These are not used by any known coding system.
1269
1270    Control characters for these functions are defined by macros
1271    ISO_CODE_XXX in `coding.h'.
1272
1273    Designations are done by the following escape sequences:
1274    ----------------------------------------------------------------------
1275    escape sequence      description
1276    ----------------------------------------------------------------------
1277    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1278    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1279    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1280    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1281    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1282    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1283    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1284    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1285    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1286    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1287    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1288    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1289    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1290    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1291    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1292    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1293    ----------------------------------------------------------------------
1294
1295    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1296    of dimension 1, chars 94, and final character <F>, etc...
1297
1298    Note (*): Although these designations are not allowed in ISO2022,
1299    Emacs accepts them on decoding, and produces them on encoding
1300    CHARS96 character sets in a coding system which is characterized as
1301    7-bit environment, non-locking-shift, and non-single-shift.
1302
1303    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1304    '(' can be omitted.  We refer to this as "short-form" hereafter.
1305
1306    Now you may notice that there are a lot of ways of encoding the
1307    same multilingual text in ISO2022.  Actually, there exist many
1308    coding systems such as Compound Text (used in X11's inter client
1309    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1310    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1311    localized platforms), and all of these are variants of ISO2022.
1312
1313    In addition to the above, Emacs handles two more kinds of escape
1314    sequences: ISO6429's direction specification and Emacs' private
1315    sequence for specifying character composition.
1316
1317    ISO6429's direction specification takes the following form:
1318         o CSI ']'      -- end of the current direction
1319         o CSI '0' ']'  -- end of the current direction
1320         o CSI '1' ']'  -- start of left-to-right text
1321         o CSI '2' ']'  -- start of right-to-left text
1322    The control character CSI (0x9B: control sequence introducer) is
1323    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1324
1325    Character composition specification takes the following form:
1326         o ESC '0' -- start relative composition
1327         o ESC '1' -- end composition
1328         o ESC '2' -- start rule-base composition (*)
1329         o ESC '3' -- start relative composition with alternate chars  (**)
1330         o ESC '4' -- start rule-base composition with alternate chars  (**)
1331   Since these are not standard escape sequences of any ISO standard,
1332   the use of them with these meanings is restricted to Emacs only.
1333
1334   (*) This form is used only in Emacs 20.5 and older versions,
1335   but the newer versions can safely decode it.
1336   (**) This form is used only in Emacs 21.1 and newer versions,
1337   and the older versions can't decode it.
1338
1339   Here's a list of example usages of these composition escape
1340   sequences (categorized by `enum composition_method').
1341
1342   COMPOSITION_RELATIVE:
1343         ESC 0 CHAR [ CHAR ] ESC 1
1344   COMPOSITION_WITH_RULE:
1345         ESC 2 CHAR [ RULE CHAR ] ESC 1
1346   COMPOSITION_WITH_ALTCHARS:
1347         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1348   COMPOSITION_WITH_RULE_ALTCHARS:
1349         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1350
1351 enum iso_code_class_type iso_code_class[256];
1352
1353 #define CHARSET_OK(idx, charset, c)                                     \
1354   (coding_system_table[idx]                                             \
1355    && (charset == CHARSET_ASCII                                         \
1356        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1357            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1358    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1359                                               charset)                  \
1360        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1361
1362 #define SHIFT_OUT_OK(idx) \
1363   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1364
1365 #define COMPOSITION_OK(idx)     \
1366   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1367
1368 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1369    Check if the text between SRC and SRC_END is encoded in ISO2022.
1370    If it is, return an integer in which any appropriate bits among:
1371         CODING_CATEGORY_MASK_ISO_7
1372         CODING_CATEGORY_MASK_ISO_7_TIGHT
1373         CODING_CATEGORY_MASK_ISO_8_1
1374         CODING_CATEGORY_MASK_ISO_8_2
1375         CODING_CATEGORY_MASK_ISO_7_ELSE
1376         CODING_CATEGORY_MASK_ISO_8_ELSE
1377    are set.  If a code which should never appear in ISO2022 is found,
1378    return 0.  */
1379
1380 static int
1381 detect_coding_iso2022 (src, src_end, multibytep)
1382      unsigned char *src, *src_end;
1383      int multibytep;
1384 {
1385   int mask = CODING_CATEGORY_MASK_ISO;  /* Categories not yet ruled out.  */
1386   int mask_found = 0;                   /* Categories with evidence found.  */
1387   int reg[4], shift_out = 0, single_shifting = 0;
1388   int c, c1, charset;
1389   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE (presumably it uses `coding').  */
1390   struct coding_system dummy_coding;
1391   struct coding_system *coding = &dummy_coding;
1392   Lisp_Object safe_chars;  /* Not used directly; presumably set by CHARSET_OK.  */
1393
1394   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1395   while (mask && src < src_end)
1396     {
1397       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1398     retry:  /* Re-dispatch on a byte already read at the end of `default'.  */
1399       switch (c)
1400         {
1401         case ISO_CODE_ESC:
1402           if (inhibit_iso_escape_detection)
1403             break;
1404           single_shifting = 0;
1405           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1406           if (c >= '(' && c <= '/')
1407             {
1408               /* Designation sequence for a charset of dimension 1.  */
1409               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1410               if (c1 < ' ' || c1 >= 0x80
1411                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1412                 /* Invalid designation sequence.  Just ignore.  */
1413                 break;
1414               reg[(c - '(') % 4] = charset;  /* '('..'/' map onto reg[0..3].  */
1415             }
1416           else if (c == '$')
1417             {
1418               /* Designation sequence for a charset of dimension 2.  */
1419               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1420               if (c >= '@' && c <= 'B')
1421                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1422                 reg[0] = charset = iso_charset_table[1][0][c];
1423               else if (c >= '(' && c <= '/')
1424                 {
1425                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1426                   if (c1 < ' ' || c1 >= 0x80
1427                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1428                     /* Invalid designation sequence.  Just ignore.  */
1429                     break;
1430                   reg[(c - '(') % 4] = charset;
1431                 }
1432               else
1433                 /* Invalid designation sequence.  Just ignore.  */
1434                 break;
1435             }
1436           else if (c == 'N' || c == 'O')
1437             {
1438               /* ESC <Fe> for SS2 or SS3.  */
1439               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;  /* Keep only this one.  */
1440               break;
1441             }
1442           else if (c >= '0' && c <= '4')
1443             {
1444               /* ESC <Fp> for start/end composition.  */
1445               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1446                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1447               else
1448                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1449               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1450                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1451               else
1452                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1453               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1454                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1455               else
1456                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1457               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1458                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1459               else
1460                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1461               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1462                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1463               else
1464                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1465               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1466                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1467               else
1468                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1469               break;
1470             }
1471           else
1472             /* Invalid escape sequence.  Just ignore.  */
1473             break;
1474
1475           /* We found a valid designation sequence for CHARSET.  */
1476           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1477           c = MAKE_CHAR (charset, 0, 0);
1478           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1479             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1480           else
1481             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1482           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1483             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1484           else
1485             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1486           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1487             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1488           else
1489             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1490           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1491             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1492           else
1493             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1494           break;
1495
1496         case ISO_CODE_SO:
1497           if (inhibit_iso_escape_detection)
1498             break;
1499           single_shifting = 0;
1500           if (shift_out == 0
1501               && (reg[1] >= 0
1502                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1503                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1504             {
1505               /* Locking shift out.  */
1506               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1507               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1508             }
1509           break;
1510
1511         case ISO_CODE_SI:
1512           if (inhibit_iso_escape_detection)
1513             break;
1514           single_shifting = 0;
1515           if (shift_out == 1)  /* NOTE(review): shift_out is not modified in the visible code.  */
1516             {
1517               /* Locking shift in.  */
1518               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1519               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1520             }
1521           break;
1522
1523         case ISO_CODE_CSI:
1524           single_shifting = 0;  /* Fall through to the code shared with SS2/SS3.  */
1525         case ISO_CODE_SS2:
1526         case ISO_CODE_SS3:
1527           {
1528             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1529
1530             if (inhibit_iso_escape_detection)
1531               break;
1532             if (c != ISO_CODE_CSI)
1533               {
1534                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1535                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1536                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1537                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1538                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1539                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1540                 single_shifting = 1;
1541               }
1542             if (VECTORP (Vlatin_extra_code_table)
1543                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1544               {
1545                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1546                     & CODING_FLAG_ISO_LATIN_EXTRA)
1547                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1548                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1549                     & CODING_FLAG_ISO_LATIN_EXTRA)
1550                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1551               }
1552             mask &= newmask;
1553             mask_found |= newmask;
1554           }
1555           break;
1556
1557         default:
1558           if (c < 0x80)
1559             {
1560               single_shifting = 0;
1561               break;
1562             }
1563           else if (c < 0xA0)
1564             {
1565               single_shifting = 0;
1566               if (VECTORP (Vlatin_extra_code_table)
1567                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1568                 {
1569                   int newmask = 0;
1570
1571                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572                       & CODING_FLAG_ISO_LATIN_EXTRA)
1573                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575                       & CODING_FLAG_ISO_LATIN_EXTRA)
1576                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577                   mask &= newmask;
1578                   mask_found |= newmask;
1579                 }
1580               else
1581                 return 0;
1582             }
1583           else
1584             {
1585               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1586                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1587               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1588               /* Check the length of succeeding codes of the range
1589                  0xA0..0xFF.  If the byte length is odd, we exclude
1590                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1591                  when we are not single shifting.  */
1592               if (!single_shifting
1593                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1594                 {
1595                   int i = 1;  /* Run length; counts the byte already read.  */
1596
1597                   c = -1;     /* Stays -1 if no further byte is read.  */
1598                   while (src < src_end)
1599                     {
1600                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1601                       if (c < 0xA0)
1602                         break;
1603                       i++;
1604                     }
1605
1606                   if (i & 1 && src < src_end)
1607                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1608                   else
1609                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1610                   if (c >= 0)
1611                     /* This means that we have read one extra byte.  */
1612                     goto retry;
1613                 }
1614             }
1615           break;
1616         }
1617     }
1618  label_end_of_loop:  /* Presumably the exit target of ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
1619   return (mask & mask_found);  /* Categories both found and never ruled out.  */
1620 }
1621
1622 /* Decode a character whose charset is CHARSET, whose 1st position
1623    code is C1 and whose 2nd position code is C2, and return the
1624    decoded character code.  If the variable `translation_table' in the
1625    caller's scope is non-nil, return the translated code instead.  */
1626
1627 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1628   (NILP (translation_table)                     \
1629    ? MAKE_CHAR (charset, c1, c2)                \
1630    : translate_char (translation_table, -1, charset, c1, c2))
1631
1632 /* Set designation state into CODING (charset of FINAL_CHAR to register REG).  */
1633 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1634   do {                                                                     \
1635     int charset, c;                                                        \
1636     /* Needs `coding', `safe_chars', label_invalid_code in the caller.  */ \
1637     if (final_char < '0' || final_char >= 128)                             \
1638       goto label_invalid_code;                                             \
1639     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1640                                  make_number (chars),                      \
1641                                  make_number (final_char));                \
1642     c = MAKE_CHAR (charset, 0, 0);                                         \
1643     if (charset >= 0                                                       \
1644         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1645             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1646       {                                                                    \
1647         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1648             && reg == 0                                                    \
1649             && charset == CHARSET_ASCII)                                   \
1650           {                                                                \
1651             /* We should insert this designation sequence as is so         \
1652                that it is surely written back to a file.  */               \
1653             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1654             goto label_invalid_code;                                       \
1655           }                                                                \
1656         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1657         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1658             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1659           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1660         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1661       }                                                                    \
1662     else                                                                   \
1663       {                                                                    \
1664         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1665         goto label_invalid_code;                                           \
1666       }                                                                    \
1667   } while (0)
1668
1669 /* Allocate a composition-data block recording CHAR_OFFSET, link it after
1670    CODING->cmp_data (if any), and make it CODING's current block.  */
1671
1672 void
1673 coding_allocate_composition_data (coding, char_offset)
1674      struct coding_system *coding;
1675      int char_offset;
1676 {
1677   struct composition_data *cmp_data
1678     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1679
1680   cmp_data->char_offset = char_offset;
1681   cmp_data->used = 0;                   /* The new block starts empty.  */
1682   cmp_data->prev = coding->cmp_data;    /* May be NULL for the first block.  */
1683   cmp_data->next = NULL;
1684   if (coding->cmp_data)                 /* Append to the existing chain.  */
1685     coding->cmp_data->next = cmp_data;
1686   coding->cmp_data = cmp_data;          /* The new block becomes current.  */
1687   coding->cmp_data_start = 0;
1688 }
1689
1690 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1691    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1692    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1693    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1694    ESC 4 : alt&rule composition : ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
1695    C1 is the byte after ESC.  Uses `coding', `dst' and label_end_of_loop.  */
1696
1697 #define DECODE_COMPOSITION_START(c1)                                       \
1698   do {                                                                     \
1699     if (coding->composing == COMPOSITION_DISABLED)                         \
1700       {                                                                    \
1701         *dst++ = ISO_CODE_ESC;  /* Emit ESC and C1 & 0x7f literally.  */   \
1702         *dst++ = c1 & 0x7f;                                                \
1703         coding->produced_char += 2;                                        \
1704       }                                                                    \
1705     else if (!COMPOSING_P (coding))                                        \
1706       {                                                                    \
1707         /* This is surely the start of a composition.  We must be sure     \
1708            that coding->cmp_data has enough space to store the             \
1709            information about the composition.  If not, terminate the       \
1710            current decoding loop, allocate one more memory block for       \
1711            coding->cmp_data in the caller, then start the decoding         \
1712            loop again.  We can't allocate memory here directly because     \
1713            it may cause buffer/string relocation.  */                      \
1714         if (!coding->cmp_data                                              \
1715             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1716                 >= COMPOSITION_DATA_SIZE))                                 \
1717           {                                                                \
1718             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1719             goto label_end_of_loop;                                        \
1720           }                                                                \
1721         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1722                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1723                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1724                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1725         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1726                                       coding->composing);                  \
1727         coding->composition_rule_follows = 0;                              \
1728       }                                                                    \
1729     else                                                                   \
1730       {                                                                    \
1731         /* We are already handling a composition.  If the method is        \
1732            one of the following two, the codes following the current       \
1733            escape sequence are actual characters stored in a buffer.  */   \
1734         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1735             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1736           {                                                                \
1737             coding->composing = COMPOSITION_RELATIVE;                      \
1738             coding->composition_rule_follows = 0;                          \
1739           }                                                                \
1740       }                                                                    \
1741   } while (0)
1742
1743 /* Handle composition end sequence ESC 1.  C1 is the byte after ESC.  */
1744
1745 #define DECODE_COMPOSITION_END(c1)                                      \
1746   do {                                                                  \
1747     if (! COMPOSING_P (coding))                                         \
1748       {                                                                 \
1749         *dst++ = ISO_CODE_ESC;  /* Not composing: emit ESC C1.  */      \
1750         *dst++ = c1;                                                    \
1751         coding->produced_char += 2;                                     \
1752       }                                                                 \
1753     else                                                                \
1754       {                                                                 \
1755         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1756         coding->composing = COMPOSITION_NO;                             \
1757       }                                                                 \
1758   } while (0)
1759
1760 /* Decode a composition rule from the byte C1 (and maybe one more byte
1761    from SRC, read into the caller's `c2') and store one encoded
1762    composition rule in coding->cmp_data.  C1 itself is decremented by 32.  */
1763
1764 #define DECODE_COMPOSITION_RULE(c1)                                     \
1765   do {                                                                  \
1766     int rule = 0;  /* Stays 0 if C1 - 32 is 93 or more.  */             \
1767     (c1) -= 32;                                                         \
1768     if (c1 < 81)                /* old format (before ver.21) */        \
1769       {                                                                 \
1770         int gref = (c1) / 9;                                            \
1771         int nref = (c1) % 9;                                            \
1772         if (gref == 4) gref = 10;                                       \
1773         if (nref == 4) nref = 10;                                       \
1774         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1775       }                                                                 \
1776     else if (c1 < 93)           /* new format (after ver.21) */         \
1777       {                                                                 \
1778         ONE_MORE_BYTE (c2);                                             \
1779         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1780       }                                                                 \
1781     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1782     coding->composition_rule_follows = 0;                               \
1783   } while (0)
1784
1785
1786 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1787
1788 static void
1789 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1790      struct coding_system *coding;
1791      unsigned char *source, *destination;
1792      int src_bytes, dst_bytes;
1793 {
1794   unsigned char *src = source;
1795   unsigned char *src_end = source + src_bytes;
1796   unsigned char *dst = destination;
1797   unsigned char *dst_end = destination + dst_bytes;
1798   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1799   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1800   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1801   /* SRC_BASE remembers the start position in source in each loop.
1802      The loop will be exited when there's not enough source code
1803      (within macro ONE_MORE_BYTE), or when there's not enough
1804      destination area to produce a character (within macro
1805      EMIT_CHAR).  */
1806   unsigned char *src_base;
1807   int c, charset;
1808   Lisp_Object translation_table;
1809   Lisp_Object safe_chars;
1810
1811   safe_chars = coding_safe_chars (coding->symbol);
1812
1813   if (NILP (Venable_character_translation))
1814     translation_table = Qnil;
1815   else
1816     {
1817       translation_table = coding->translation_table_for_decode;
1818       if (NILP (translation_table))
1819         translation_table = Vstandard_translation_table_for_decode;
1820     }
1821
1822   coding->result = CODING_FINISH_NORMAL;
1823
1824   while (1)
1825     {
1826       int c1, c2;
1827
1828       src_base = src;
1829       ONE_MORE_BYTE (c1);
1830
1831       /* We produce no character or one character.  */
1832       switch (iso_code_class [c1])
1833         {
1834         case ISO_0x20_or_0x7F:
1835           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1836             {
1837               DECODE_COMPOSITION_RULE (c1);
1838               continue;
1839             }
1840           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1841             {
1842               /* This is SPACE or DEL.  */
1843               charset = CHARSET_ASCII;
1844               break;
1845             }
1846           /* This is a graphic character, we fall down ...  */
1847
1848         case ISO_graphic_plane_0:
1849           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1850             {
1851               DECODE_COMPOSITION_RULE (c1);
1852               continue;
1853             }
1854           charset = charset0;
1855           break;
1856
1857         case ISO_0xA0_or_0xFF:
1858           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1859               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1860             goto label_invalid_code;
1861           /* This is a graphic character, we fall down ... */
1862
1863         case ISO_graphic_plane_1:
1864           if (charset1 < 0)
1865             goto label_invalid_code;
1866           charset = charset1;
1867           break;
1868
1869         case ISO_control_0:
1870           if (COMPOSING_P (coding))
1871             DECODE_COMPOSITION_END ('1');
1872
1873           /* All ISO2022 control characters in this class have the
1874              same representation in Emacs internal format.  */
1875           if (c1 == '\n'
1876               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1877               && (coding->eol_type == CODING_EOL_CR
1878                   || coding->eol_type == CODING_EOL_CRLF))
1879             {
1880               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1881               goto label_end_of_loop;
1882             }
1883           charset = CHARSET_ASCII;
1884           break;
1885
1886         case ISO_control_1:
1887           if (COMPOSING_P (coding))
1888             DECODE_COMPOSITION_END ('1');
1889           goto label_invalid_code;
1890
1891         case ISO_carriage_return:
1892           if (COMPOSING_P (coding))
1893             DECODE_COMPOSITION_END ('1');
1894
1895           if (coding->eol_type == CODING_EOL_CR)
1896             c1 = '\n';
1897           else if (coding->eol_type == CODING_EOL_CRLF)
1898             {
1899               ONE_MORE_BYTE (c1);
1900               if (c1 != ISO_CODE_LF)
1901                 {
1902                   src--;
1903                   c1 = '\r';
1904                 }
1905             }
1906           charset = CHARSET_ASCII;
1907           break;
1908
1909         case ISO_shift_out:
1910           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1911               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1912             goto label_invalid_code;
1913           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1914           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1915           continue;
1916
1917         case ISO_shift_in:
1918           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1919             goto label_invalid_code;
1920           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1921           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1922           continue;
1923
1924         case ISO_single_shift_2_7:
1925         case ISO_single_shift_2:
1926           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1927             goto label_invalid_code;
1928           /* SS2 is handled as an escape sequence of ESC 'N' */
1929           c1 = 'N';
1930           goto label_escape_sequence;
1931
1932         case ISO_single_shift_3:
1933           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1934             goto label_invalid_code;
1935           /* SS3 is handled as an escape sequence of ESC 'O' */
1936           c1 = 'O';
1937           goto label_escape_sequence;
1938
1939         case ISO_control_sequence_introducer:
1940           /* CSI is handled as an escape sequence of ESC '[' ...  */
1941           c1 = '[';
1942           goto label_escape_sequence;
1943
1944         case ISO_escape:
1945           ONE_MORE_BYTE (c1);
1946         label_escape_sequence:
1947           /* Escape sequences handled by Emacs are invocation,
1948              designation, direction specification, and character
1949              composition specification.  */
1950           switch (c1)
1951             {
1952             case '&':           /* revision of following character set */
1953               ONE_MORE_BYTE (c1);
1954               if (!(c1 >= '@' && c1 <= '~'))
1955                 goto label_invalid_code;
1956               ONE_MORE_BYTE (c1);
1957               if (c1 != ISO_CODE_ESC)
1958                 goto label_invalid_code;
1959               ONE_MORE_BYTE (c1);
1960               goto label_escape_sequence;
1961
1962             case '$':           /* designation of 2-byte character set */
1963               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1964                 goto label_invalid_code;
1965               ONE_MORE_BYTE (c1);
1966               if (c1 >= '@' && c1 <= 'B')
1967                 {       /* designation of JISX0208.1978, GB2312.1980,
1968                            or JISX0208.1980 */
1969                   DECODE_DESIGNATION (0, 2, 94, c1);
1970                 }
1971               else if (c1 >= 0x28 && c1 <= 0x2B)
1972                 {       /* designation of DIMENSION2_CHARS94 character set */
1973                   ONE_MORE_BYTE (c2);
1974                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1975                 }
1976               else if (c1 >= 0x2C && c1 <= 0x2F)
1977                 {       /* designation of DIMENSION2_CHARS96 character set */
1978                   ONE_MORE_BYTE (c2);
1979                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1980                 }
1981               else
1982                 goto label_invalid_code;
1983               /* We must update these variables now.  */
1984               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1985               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1986               continue;
1987
1988             case 'n':           /* invocation of locking-shift-2 */
1989               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1990                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1991                 goto label_invalid_code;
1992               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1993               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1994               continue;
1995
1996             case 'o':           /* invocation of locking-shift-3 */
1997               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1998                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1999                 goto label_invalid_code;
2000               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2001               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2002               continue;
2003
2004             case 'N':           /* invocation of single-shift-2 */
2005               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2006                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2007                 goto label_invalid_code;
2008               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2009               ONE_MORE_BYTE (c1);
2010               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2011                 goto label_invalid_code;
2012               break;
2013
2014             case 'O':           /* invocation of single-shift-3 */
2015               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2016                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2017                 goto label_invalid_code;
2018               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2019               ONE_MORE_BYTE (c1);
2020               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2021                 goto label_invalid_code;
2022               break;
2023
2024             case '0': case '2': case '3': case '4': /* start composition */
2025               DECODE_COMPOSITION_START (c1);
2026               continue;
2027
2028             case '1':           /* end composition */
2029               DECODE_COMPOSITION_END (c1);
2030               continue;
2031
2032             case '[':           /* specification of direction */
2033               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2034                 goto label_invalid_code;
2035               /* For the moment, nested direction is not supported.
2036                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2037                  left-to-right, and nonzero means right-to-left.  */
2038               ONE_MORE_BYTE (c1);
2039               switch (c1)
2040                 {
2041                 case ']':       /* end of the current direction */
2042                   coding->mode &= ~CODING_MODE_DIRECTION;
2043
2044                 case '0':       /* end of the current direction */
2045                 case '1':       /* start of left-to-right direction */
2046                   ONE_MORE_BYTE (c1);
2047                   if (c1 == ']')
2048                     coding->mode &= ~CODING_MODE_DIRECTION;
2049                   else
2050                     goto label_invalid_code;
2051                   break;
2052
2053                 case '2':       /* start of right-to-left direction */
2054                   ONE_MORE_BYTE (c1);
2055                   if (c1 == ']')
2056                     coding->mode |= CODING_MODE_DIRECTION;
2057                   else
2058                     goto label_invalid_code;
2059                   break;
2060
2061                 default:
2062                   goto label_invalid_code;
2063                 }
2064               continue;
2065
2066             case '%':
2067               if (COMPOSING_P (coding))
2068                 DECODE_COMPOSITION_END ('1');
2069               ONE_MORE_BYTE (c1);
2070               if (c1 == '/')
2071                 {
2072                   /* CTEXT extended segment:
2073                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2074                      We keep these bytes as is for the moment.
2075                      They may be decoded by post-read-conversion.  */
2076                   int dim, M, L;
2077                   int size, required;
2078                   int produced_chars;
2079
2080                   ONE_MORE_BYTE (dim);
2081                   ONE_MORE_BYTE (M);
2082                   ONE_MORE_BYTE (L);
2083                   size = ((M - 128) * 128) + (L - 128);
2084                   required = 8 + size * 2;
2085                   if (dst + required > (dst_bytes ? dst_end : src))
2086                     goto label_end_of_loop;
2087                   *dst++ = ISO_CODE_ESC;
2088                   *dst++ = '%';
2089                   *dst++ = '/';
2090                   *dst++ = dim;
2091                   produced_chars = 4;
2092                   dst += CHAR_STRING (M, dst), produced_chars++;
2093                   dst += CHAR_STRING (L, dst), produced_chars++;
2094                   while (size-- > 0)
2095                     {
2096                       ONE_MORE_BYTE (c1);
2097                       dst += CHAR_STRING (c1, dst), produced_chars++;
2098                     }
2099                   coding->produced_char += produced_chars;
2100                 }
2101               else if (c1 == 'G')
2102                 {
2103                   unsigned char *d = dst;
2104                   int produced_chars;
2105
2106                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2107                      ESC % G --UTF-8-BYTES-- ESC % @
2108                      We keep these bytes as is for the moment.
2109                      They may be decoded by post-read-conversion.  */
2110                   if (d + 6 > (dst_bytes ? dst_end : src))
2111                     goto label_end_of_loop;
2112                   *d++ = ISO_CODE_ESC;
2113                   *d++ = '%';
2114                   *d++ = 'G';
2115                   produced_chars = 3;
2116                   while (d + 1 < (dst_bytes ? dst_end : src))
2117                     {
2118                       ONE_MORE_BYTE (c1);
2119                       if (c1 == ISO_CODE_ESC
2120                           && src + 1 < src_end
2121                           && src[0] == '%'
2122                           && src[1] == '@')
2123                         break;
2124                       d += CHAR_STRING (c1, d), produced_chars++;
2125                     }
2126                   if (d + 3 > (dst_bytes ? dst_end : src))
2127                     goto label_end_of_loop;
2128                   *d++ = ISO_CODE_ESC;
2129                   *d++ = '%';
2130                   *d++ = '@';
2131                   dst = d;
2132                   coding->produced_char += produced_chars + 3;
2133                 }
2134               else
2135                 goto label_invalid_code;
2136               continue;
2137
2138             default:
2139               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2140                 goto label_invalid_code;
2141               if (c1 >= 0x28 && c1 <= 0x2B)
2142                 {       /* designation of DIMENSION1_CHARS94 character set */
2143                   ONE_MORE_BYTE (c2);
2144                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2145                 }
2146               else if (c1 >= 0x2C && c1 <= 0x2F)
2147                 {       /* designation of DIMENSION1_CHARS96 character set */
2148                   ONE_MORE_BYTE (c2);
2149                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2150                 }
2151               else
2152                 goto label_invalid_code;
2153               /* We must update these variables now.  */
2154               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2155               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2156               continue;
2157             }
2158         }
2159
2160       /* Now we know CHARSET and 1st position code C1 of a character.
2161          Produce a multibyte sequence for that character while getting
2162          2nd position code C2 if necessary.  */
2163       if (CHARSET_DIMENSION (charset) == 2)
2164         {
2165           ONE_MORE_BYTE (c2);
2166           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2167             /* C2 is not in a valid range.  */
2168             goto label_invalid_code;
2169         }
2170       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2171       EMIT_CHAR (c);
2172       continue;
2173
2174     label_invalid_code:
2175       coding->errors++;
2176       if (COMPOSING_P (coding))
2177         DECODE_COMPOSITION_END ('1');
2178       src = src_base;
2179       c = *src++;
2180       EMIT_CHAR (c);
2181     }
2182
2183  label_end_of_loop:
2184   coding->consumed = coding->consumed_char = src_base - source;
2185   coding->produced = dst - destination;
2186   return;
2187 }
2188
2189
2190 /* ISO2022 encoding stuff.  */
2191
/*
   Saying just "ISO2022" is not enough to determine how to encode; we
   have to specify more details.  In Emacs, each ISO2022 coding system
   variant has the following specifications:
        1. Initial designation to G0 through G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use JISX0201-Roman in place of ASCII?
        9. Use JISX0208-1978 in place of JISX0208-1983?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
2210
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past the bytes written, and record
   CHARSET as the current designation of REG in CODING.

   The sequence is
     [ESC '&' <'@' + revision>] ESC ['$'] [<intermediate>] <final-char>
   The revision prefix is produced only when the revision number
   registered for CHARSET is less than 255.  '$' is produced only for
   a charset whose dimension is not 1.  <intermediate> is the REG'th
   character of "()*+" for a 94-character set and of ",-./" otherwise.
   It is omitted (the short form) only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set, REG is 0, and CHARSET is a
   94-character set of dimension other than 1 whose <final-char> is
   '@', 'A', or 'B'.

   The macro expects `dst' to be in scope and evaluates its arguments
   more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);   \
    int multi_dimension = CHARSET_DIMENSION (charset) != 1;             \
    int is_94 = CHARSET_CHARS (charset) == 94;                          \
    int short_form = (multi_dimension && is_94                          \
                      && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)   \
                      && reg == 0                                       \
                      && final_char >= '@' && final_char <= 'B');       \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + revision;                                        \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (multi_dimension)                                                \
      *dst++ = '$';                                                     \
    if (! short_form)                                                   \
      *dst++ = (unsigned char) (is_94 ? "()*+"[reg] : ",-./"[reg]);     \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2253
/* The following two macros emit at DST the ISO2022 single-shift
   functions single-shift-2 and single-shift-3.  When CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the function is produced as the
   escape sequence ESC 'N' (SS2) or ESC 'O' (SS3); otherwise it is
   the single control code ISO_CODE_SS2 or ISO_CODE_SS3.  In both
   cases CODING is marked as being in single-shifting state.  The
   macros expect `coding' and `dst' to be in scope.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2275
/* The following four macros emit at DST the ISO2022 locking-shift
   functions and record in CODING which graphic register is now
   invoked to graphic plane 0:
     ENCODE_SHIFT_IN         ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT        ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2  ESC 'n'       register 2
     ENCODE_LOCKING_SHIFT_3  ESC 'o'       register 3
   They expect `coding' and `dst' to be in scope.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2303
/* Emit at DST the single byte for a DIMENSION1 character whose
   character set is CHARSET and whose position-code is C1.

   If CODING is in single-shifting state, the byte is C1 with the
   high bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set
   (otherwise), and the single-shifting state is cleared.  Otherwise,
   if CHARSET is the charset of graphic plane 0 the byte is
   C1 & 0x7F, and if it is the charset of graphic plane 1 the byte is
   C1 | 0x80.  If CHARSET is on neither plane,
   encode_invocation_designation is called to produce the necessary
   designation and invocation codes, and the checks are repeated.

   The macro expects `coding' and `dst' to be in scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
      {                                                                 \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to any graphic plane yet.  Invoke     \
           it (designating it to a graphic register first if            \
           needed), then go around again to emit the byte.  */          \
        dst = encode_invocation_designation (charset, coding, dst);     \
        continue;                                                       \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      *dst++ = c1 & 0x7F;                                               \
    else                                                                \
      *dst++ = c1 | 0x80;                                               \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                       \
    break;                                                              \
  } while (1)
2336
/* Emit at DST the two bytes for a DIMENSION2 character whose
   character set is CHARSET and whose position-codes are C1 and C2.

   If CODING is in single-shifting state, both bytes have the high
   bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise),
   and the single-shifting state is cleared.  Otherwise, if CHARSET
   is the charset of graphic plane 0 the bytes are C1 & 0x7F and
   C2 & 0x7F, and if it is the charset of graphic plane 1 they are
   C1 | 0x80 and C2 | 0x80.  If CHARSET is on neither plane,
   encode_invocation_designation is called to produce the necessary
   designation and invocation codes, and the checks are repeated.

   The macro expects `coding' and `dst' to be in scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                     \
      {                                                                 \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))       \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
            break;                                                      \
          }                                                             \
        if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))       \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to any graphic plane yet.  Invoke     \
           it (designating it to a graphic register first if            \
           needed), then go around again to emit the bytes.  */         \
        dst = encode_invocation_designation (charset, coding, dst);     \
        continue;                                                       \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                       \
    break;                                                              \
  } while (1)
2369
/* Encode character C at DST.  C is split into its charset and
   position codes with SPLIT_CHAR.

   If the charset is not defined, the position codes are emitted as
   they are (the second one only if it is non-negative).  Otherwise
   the character goes through ENCODE_ISO_CHARACTER_DIMENSION1 or
   ENCODE_ISO_CHARACTER_DIMENSION2 according to the dimension of its
   charset, after these substitutions requested by `coding->flags':
     CODING_FLAG_ISO_USE_ROMAN:  CHARSET_ASCII is encoded as
       charset_latin_jisx0201;
     CODING_FLAG_ISO_USE_OLDJIS: charset_jisx0208 is encoded as
       charset_jisx0208_1978.

   The macro expects `coding' and `dst' to be in scope.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) != 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
  } while (0)
2399
2400
/* Do not encode character C itself; encode
   CODING_REPLACEMENT_CHARACTER in its place, once, or twice when the
   width of C's charset is greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                              \
  do {                                                          \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) < 2)                   \
      break;                                                    \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
  } while (0)
2409
2410
2411 /* Produce designation and invocation codes at a place pointed by DST
2412    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2413    Return new DST.  */
2414
2415 unsigned char *
2416 encode_invocation_designation (charset, coding, dst)
2417      int charset;
2418      struct coding_system *coding;
2419      unsigned char *dst;
2420 {
2421   int reg;                      /* graphic register number */
2422
2423   /* At first, check designations.  */
2424   for (reg = 0; reg < 4; reg++)
2425     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2426       break;
2427
2428   if (reg >= 4)
2429     {
2430       /* CHARSET is not yet designated to any graphic registers.  */
2431       /* At first check the requested designation.  */
2432       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2433       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2434         /* Since CHARSET requests no special designation, designate it
2435            to graphic register 0.  */
2436         reg = 0;
2437
2438       ENCODE_DESIGNATION (charset, reg, coding);
2439     }
2440
2441   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2442       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2443     {
2444       /* Since the graphic register REG is not invoked to any graphic
2445          planes, invoke it to graphic plane 0.  */
2446       switch (reg)
2447         {
2448         case 0:                 /* graphic register 0 */
2449           ENCODE_SHIFT_IN;
2450           break;
2451
2452         case 1:                 /* graphic register 1 */
2453           ENCODE_SHIFT_OUT;
2454           break;
2455
2456         case 2:                 /* graphic register 2 */
2457           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2458             ENCODE_SINGLE_SHIFT_2;
2459           else
2460             ENCODE_LOCKING_SHIFT_2;
2461           break;
2462
2463         case 3:                 /* graphic register 3 */
2464           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2465             ENCODE_SINGLE_SHIFT_3;
2466           else
2467             ENCODE_LOCKING_SHIFT_3;
2468           break;
2469         }
2470     }
2471
2472   return dst;
2473 }
2474
/* Emit at DST the 2-byte code for encoded composition rule RULE.
   RULE is decoded with COMPOSITION_DECODE_RULE into GREF and NREF;
   the first byte is 32 + 81 + GREF and the second is 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)           \
  do {                                          \
    int gref, nref;                             \
                                                \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    dst[0] = 32 + 81 + gref;                    \
    dst[1] = 32 + nref;                         \
    dst += 2;                                   \
  } while (0)
2484
/* Emit at DST the escape sequence that starts a composition (ESC 0,
   ESC 3, or ESC 4).  DATA points to an array of integers describing
   the composition; see the comment in coding.h for its format.

   DATA[3] becomes `coding->composing'.  For COMPOSITION_RELATIVE the
   sequence is ESC 0.  For any other method it is ESC 3
   (COMPOSITION_WITH_ALTCHARS) or ESC 4 (everything else), and CODING
   is set up so that the composition's components are read starting
   at index `cmp_data_start + 4', with no composition rule pending.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2504
/* Emit at DST the escape sequence ESC 1 that ends the current
   composition, and update CODING accordingly: `composing' becomes
   COMPOSITION_NO and `cmp_data_start' is advanced by DATA[0].  If
   that exhausts the current block of composition data and another
   block follows, CODING moves on to the start of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    *dst++ = ISO_CODE_ESC;                                      \
    *dst++ = '1';                                               \
    if (coding->cmp_data->next                                  \
        && coding->cmp_data_start == coding->cmp_data->used)    \
      {                                                         \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
2520
/* Emit at DST the composition start sequence ESC 0 and set
   `coding->composing' to COMPOSITION_RELATIVE.  Here the sequence
   does not begin a new composition; it means that the components
   (alternate chars and composition rules) of the current composition
   have just been produced and that the actual text follows in
   SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
2532
/* The following three macros produce codes for indicating the
   direction of text.  They expect `coding' and `dst' to be in scope
   and advance DST past the bytes written.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits a control sequence
   introducer: the two bytes ESC '[' when CODING_FLAG_ISO_SEVEN_BITS
   is set in `coding->flags', the single byte ISO_CODE_CSI otherwise.
   The flag is tested as a bit, because `coding->flags' carries other
   flag bits as well.

   ENCODE_DIRECTION_R2L emits the introducer followed by "2]" (start
   of right-to-left direction), and ENCODE_DIRECTION_L2R emits the
   introducer followed by "0]" (end of the current direction).  Both
   are single statements, usable wherever a statement is allowed.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2548
/* Produce at DST the codes that bring the graphic planes and
   registers back to their initial state: a shift-in if graphic
   plane 0 does not currently invoke register 0, then, for each
   register that has an initial designation (a non-negative charset)
   differing from its current designation, the sequence designating
   the initial charset to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (reg = 0; reg < 4; reg++)                                           \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)          \
          continue;                                                         \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                       \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))           \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                     \
  } while (0)
2563
2564 /* Produce designation sequences of charsets in the line started from
2565    SRC to a place pointed by DST, and return updated DST.
2566
2567    If the current block ends before any end-of-line, we may fail to
2568    find all the necessary designations.  */
2569
2570 static unsigned char *
2571 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2572      struct coding_system *coding;
2573      Lisp_Object translation_table;
2574      unsigned char *src, *src_end, *dst;
2575 {
2576   int charset, c, found = 0, reg;
2577   /* Table of charsets to be designated to each graphic register.  */
2578   int r[4];
2579
2580   for (reg = 0; reg < 4; reg++)
2581     r[reg] = -1;
2582
2583   while (found < 4)
2584     {
2585       ONE_MORE_CHAR (c);
2586       if (c == '\n')
2587         break;
2588
2589       charset = CHAR_CHARSET (c);
2590       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2591       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2592         {
2593           found++;
2594           r[reg] = charset;
2595         }
2596     }
2597
2598  label_end_of_loop:
2599   if (found)
2600     {
2601       for (reg = 0; reg < 4; reg++)
2602         if (r[reg] >= 0
2603             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2604           ENCODE_DESIGNATION (r[reg], reg, coding);
2605     }
2606
2607   return dst;
2608 }
2609
2610 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2611
2612 static void
2613 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2614      struct coding_system *coding;
2615      unsigned char *source, *destination;
2616      int src_bytes, dst_bytes;
2617 {
2618   unsigned char *src = source;
2619   unsigned char *src_end = source + src_bytes;
2620   unsigned char *dst = destination;
2621   unsigned char *dst_end = destination + dst_bytes;
2622   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2623      from DST_END to assure overflow checking is necessary only at the
2624      head of loop.  */
2625   unsigned char *adjusted_dst_end = dst_end - 19;
2626   /* SRC_BASE remembers the start position in source in each loop.
2627      The loop will be exited when there's not enough source text to
2628      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2629      there's not enough destination area to produce encoded codes
2630      (within macro EMIT_BYTES).  */
2631   unsigned char *src_base;
2632   int c;
2633   Lisp_Object translation_table;
2634   Lisp_Object safe_chars;
2635
2636   if (coding->flags & CODING_FLAG_ISO_SAFE)
2637     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2638
2639   safe_chars = coding_safe_chars (coding->symbol);
2640
2641   if (NILP (Venable_character_translation))
2642     translation_table = Qnil;
2643   else
2644     {
2645       translation_table = coding->translation_table_for_encode;
2646       if (NILP (translation_table))
2647         translation_table = Vstandard_translation_table_for_encode;
2648     }
2649
2650   coding->consumed_char = 0;
2651   coding->errors = 0;
2652   while (1)
2653     {
2654       src_base = src;
2655
2656       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2657         {
2658           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2659           break;
2660         }
2661
2662       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2663           && CODING_SPEC_ISO_BOL (coding))
2664         {
2665           /* We have to produce designation sequences if any now.  */
2666           dst = encode_designation_at_bol (coding, translation_table,
2667                                            src, src_end, dst);
2668           CODING_SPEC_ISO_BOL (coding) = 0;
2669         }
2670
2671       /* Check composition start and end.  */
2672       if (coding->composing != COMPOSITION_DISABLED
2673           && coding->cmp_data_start < coding->cmp_data->used)
2674         {
2675           struct composition_data *cmp_data = coding->cmp_data;
2676           int *data = cmp_data->data + coding->cmp_data_start;
2677           int this_pos = cmp_data->char_offset + coding->consumed_char;
2678
2679           if (coding->composing == COMPOSITION_RELATIVE)
2680             {
2681               if (this_pos == data[2])
2682                 {
2683                   ENCODE_COMPOSITION_END (coding, data);
2684                   cmp_data = coding->cmp_data;
2685                   data = cmp_data->data + coding->cmp_data_start;
2686                 }
2687             }
2688           else if (COMPOSING_P (coding))
2689             {
2690               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2691               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2692                 /* We have consumed components of the composition.
2693                    What follows in SRC is the composition's base
2694                    text.  */
2695                 ENCODE_COMPOSITION_FAKE_START (coding);
2696               else
2697                 {
2698                   int c = cmp_data->data[coding->cmp_data_index++];
2699                   if (coding->composition_rule_follows)
2700                     {
2701                       ENCODE_COMPOSITION_RULE (c);
2702                       coding->composition_rule_follows = 0;
2703                     }
2704                   else
2705                     {
2706                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2707                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2708                         ENCODE_UNSAFE_CHARACTER (c);
2709                       else
2710                         ENCODE_ISO_CHARACTER (c);
2711                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2712                         coding->composition_rule_follows = 1;
2713                     }
2714                   continue;
2715                 }
2716             }
2717           if (!COMPOSING_P (coding))
2718             {
2719               if (this_pos == data[1])
2720                 {
2721                   ENCODE_COMPOSITION_START (coding, data);
2722                   continue;
2723                 }
2724             }
2725         }
2726
2727       ONE_MORE_CHAR (c);
2728
2729       /* Now encode the character C.  */
2730       if (c < 0x20 || c == 0x7F)
2731         {
2732           if (c == '\r')
2733             {
2734               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2735                 {
2736                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2737                     ENCODE_RESET_PLANE_AND_REGISTER;
2738                   *dst++ = c;
2739                   continue;
2740                 }
2741               /* fall down to treat '\r' as '\n' ...  */
2742               c = '\n';
2743             }
2744           if (c == '\n')
2745             {
2746               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2747                 ENCODE_RESET_PLANE_AND_REGISTER;
2748               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2749                 bcopy (coding->spec.iso2022.initial_designation,
2750                        coding->spec.iso2022.current_designation,
2751                        sizeof coding->spec.iso2022.initial_designation);
2752               if (coding->eol_type == CODING_EOL_LF
2753                   || coding->eol_type == CODING_EOL_UNDECIDED)
2754                 *dst++ = ISO_CODE_LF;
2755               else if (coding->eol_type == CODING_EOL_CRLF)
2756                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2757               else
2758                 *dst++ = ISO_CODE_CR;
2759               CODING_SPEC_ISO_BOL (coding) = 1;
2760             }
2761           else
2762             {
2763               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2764                 ENCODE_RESET_PLANE_AND_REGISTER;
2765               *dst++ = c;
2766             }
2767         }
2768       else if (ASCII_BYTE_P (c))
2769         ENCODE_ISO_CHARACTER (c);
2770       else if (SINGLE_BYTE_CHAR_P (c))
2771         {
2772           *dst++ = c;
2773           coding->errors++;
2774         }
2775       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2776                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2777         ENCODE_UNSAFE_CHARACTER (c);
2778       else
2779         ENCODE_ISO_CHARACTER (c);
2780
2781       coding->consumed_char++;
2782     }
2783
2784  label_end_of_loop:
2785   coding->consumed = src_base - source;
2786   coding->produced = coding->produced_char = dst - destination;
2787 }
2788
2789 \f
2790 /*** 4. SJIS and BIG5 handlers ***/
2791
2792 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2793    quite widely.  So, for the moment, Emacs supports them in the bare
2794    C code.  But, in the future, they may be supported only by CCL.  */
2795
2796 /* SJIS is a coding system encoding three character sets: ASCII, right
2797    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2798    as is.  A character of charset katakana-jisx0201 is encoded by
2799    "position-code + 0x80".  A character of charset japanese-jisx0208
2800    is encoded in two bytes, but its two position-codes are divided and
2801    shifted so that they fit in the ranges below.
2802
2803    --- CODE RANGE of SJIS ---
2804    (character set)      (range)
2805    ASCII                0x00 .. 0x7F
2806    KATAKANA-JISX0201    0xA1 .. 0xDF
2807    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2808             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2809    -------------------------------
2810
2811 */
2812
2813 /* BIG5 is a coding system encoding two character sets: ASCII and
2814    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2815    character set and is encoded in two bytes.
2816
2817    --- CODE RANGE of BIG5 ---
2818    (character set)      (range)
2819    ASCII                0x00 .. 0x7F
2820    Big5 (1st byte)      0xA1 .. 0xFE
2821         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2822    --------------------------
2823
2824    Since the number of characters in Big5 is larger than maximum
2825    characters in Emacs' charset (96x96), it can't be handled as one
2826    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2827    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2828    contains frequently used characters and the latter contains less
2829    frequently used characters.  */
2830
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros first fold their inputs into the linear index TEMP and
   only then store the results, so an output argument may name the
   same variable as an input (e.g. DECODE_BIG5 (c1, c2, charset, c1,
   c2)).  Arguments are parenthesized in the expansion, but the input
   ones are evaluated more than once and must be free of side
   effects.  */

/* Number of Big5 characters which have the same code in 1st byte:
   0xFF - 0xA1 (94) for the 2nd bytes 0xA1..0xFE plus 0x7F - 0x40 (63)
   for the 2nd bytes 0x40..0x7E.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET, C1 and C2.  A 2nd
   byte below 0x7F is counted from 0x40; any other one is counted from
   0xA1 and follows the 63 lower codes, hence the offset 0x62
   (0xA1 - 0x3F).  1st bytes below 0xC9 select charset_big5_1, the
   others charset_big5_2, whose index restarts at zero.  The index is
   then split into rows of 0xFF - 0xA1 (94) position-codes starting at
   0x21.  The bytes are not range-checked here.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET, C1 and C2 into the BIG5
   byte pair B1, B2.  Indices of charset_big5_2 are shifted past the
   (0xC9 - 0xA1) rows used by charset_big5_1.  An in-row offset below
   0x3F (63) lands in 0x40..0x7E, the others in 0xA1..0xFE.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2863
2864 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2865    Check if a text is encoded in SJIS.  If it is, return
2866    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2867
2868 static int
2869 detect_coding_sjis (src, src_end, multibytep)
2870      unsigned char *src, *src_end;
2871      int multibytep;
2872 {
2873   int c;
2874   /* Dummy for ONE_MORE_BYTE.  */
2875   struct coding_system dummy_coding;
2876   struct coding_system *coding = &dummy_coding;
2877
2878   while (1)
2879     {
2880       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2881       if (c < 0x80)
2882         continue;
2883       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2884         return 0;
2885       if (c <= 0x9F || c >= 0xE0)
2886         {
2887           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2888           if (c < 0x40 || c == 0x7F || c > 0xFC)
2889             return 0;
2890         }
2891     }
2892  label_end_of_loop:
2893   return CODING_CATEGORY_MASK_SJIS;
2894 }
2895
2896 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2897    Check if a text is encoded in BIG5.  If it is, return
2898    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2899
2900 static int
2901 detect_coding_big5 (src, src_end, multibytep)
2902      unsigned char *src, *src_end;
2903      int multibytep;
2904 {
2905   int c;
2906   /* Dummy for ONE_MORE_BYTE.  */
2907   struct coding_system dummy_coding;
2908   struct coding_system *coding = &dummy_coding;
2909
2910   while (1)
2911     {
2912       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2913       if (c < 0x80)
2914         continue;
2915       if (c < 0xA1 || c > 0xFE)
2916         return 0;
2917       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2918       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2919         return 0;
2920     }
2921  label_end_of_loop:
2922   return CODING_CATEGORY_MASK_BIG5;
2923 }
2924
2925 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2926    Check if a text is encoded in UTF-8.  If it is, return
2927    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2928
/* Predicates classifying a single octet C of a UTF-8 sequence by its
   high-order bits.  UTF_8_OCTET_MATCH_P is the common building block:
   it is true when the bits of C selected by MASK equal BITS.  */
#define UTF_8_OCTET_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFE, 0xFC)
2936
2937 static int
2938 detect_coding_utf_8 (src, src_end, multibytep)
2939      unsigned char *src, *src_end;
2940      int multibytep;
2941 {
2942   unsigned char c;
2943   int seq_maybe_bytes;
2944   /* Dummy for ONE_MORE_BYTE.  */
2945   struct coding_system dummy_coding;
2946   struct coding_system *coding = &dummy_coding;
2947
2948   while (1)
2949     {
2950       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2951       if (UTF_8_1_OCTET_P (c))
2952         continue;
2953       else if (UTF_8_2_OCTET_LEADING_P (c))
2954         seq_maybe_bytes = 1;
2955       else if (UTF_8_3_OCTET_LEADING_P (c))
2956         seq_maybe_bytes = 2;
2957       else if (UTF_8_4_OCTET_LEADING_P (c))
2958         seq_maybe_bytes = 3;
2959       else if (UTF_8_5_OCTET_LEADING_P (c))
2960         seq_maybe_bytes = 4;
2961       else if (UTF_8_6_OCTET_LEADING_P (c))
2962         seq_maybe_bytes = 5;
2963       else
2964         return 0;
2965
2966       do
2967         {
2968           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2969           if (!UTF_8_EXTRA_OCTET_P (c))
2970             return 0;
2971           seq_maybe_bytes--;
2972         }
2973       while (seq_maybe_bytes > 0);
2974     }
2975
2976  label_end_of_loop:
2977   return CODING_CATEGORY_MASK_UTF_8;
2978 }
2979
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16.  The decision is made from
   the first two bytes only: if they are 0xFE 0xFF (a big endian byte
   order mark), return CODING_CATEGORY_MASK_UTF_16_BE; if they are
   0xFF 0xFE (little endian), return CODING_CATEGORY_MASK_UTF_16_LE;
   else return 0.  */

/* Non-zero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Non-zero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The top six bits identify the surrogate half, hence the mask
   0xFC00.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2995
2996 static int
2997 detect_coding_utf_16 (src, src_end, multibytep)
2998      unsigned char *src, *src_end;
2999      int multibytep;
3000 {
3001   unsigned char c1, c2;
3002   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3003   struct coding_system dummy_coding;
3004   struct coding_system *coding = &dummy_coding;
3005
3006   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3007   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3008
3009   if ((c1 == 0xFF) && (c2 == 0xFE))
3010     return CODING_CATEGORY_MASK_UTF_16_LE;
3011   else if ((c1 == 0xFE) && (c2 == 0xFF))
3012     return CODING_CATEGORY_MASK_UTF_16_BE;
3013
3014  label_end_of_loop:
3015   return 0;
3016 }
3017
3018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3019    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3020
3021 static void
3022 decode_coding_sjis_big5 (coding, source, destination,
3023                          src_bytes, dst_bytes, sjis_p)
3024      struct coding_system *coding;
3025      unsigned char *source, *destination;
3026      int src_bytes, dst_bytes;
3027      int sjis_p;
3028 {
3029   unsigned char *src = source;
3030   unsigned char *src_end = source + src_bytes;
3031   unsigned char *dst = destination;
3032   unsigned char *dst_end = destination + dst_bytes;
3033   /* SRC_BASE remembers the start position in source in each loop.
3034      The loop will be exited when there's not enough source code
3035      (within macro ONE_MORE_BYTE), or when there's not enough
3036      destination area to produce a character (within macro
3037      EMIT_CHAR).  */
3038   unsigned char *src_base;
3039   Lisp_Object translation_table;
3040
3041   if (NILP (Venable_character_translation))
3042     translation_table = Qnil;
3043   else
3044     {
3045       translation_table = coding->translation_table_for_decode;
3046       if (NILP (translation_table))
3047         translation_table = Vstandard_translation_table_for_decode;
3048     }
3049
3050   coding->produced_char = 0;
3051   while (1)
3052     {
3053       int c, charset, c1, c2;
3054
3055       src_base = src;
3056       ONE_MORE_BYTE (c1);
3057
3058       if (c1 < 0x80)
3059         {
3060           charset = CHARSET_ASCII;
3061           if (c1 < 0x20)
3062             {
3063               if (c1 == '\r')
3064                 {
3065                   if (coding->eol_type == CODING_EOL_CRLF)
3066                     {
3067                       ONE_MORE_BYTE (c2);
3068                       if (c2 == '\n')
3069                         c1 = c2;
3070                       else
3071                         /* To process C2 again, SRC is subtracted by 1.  */
3072                         src--;
3073                     }
3074                   else if (coding->eol_type == CODING_EOL_CR)
3075                     c1 = '\n';
3076                 }
3077               else if (c1 == '\n'
3078                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3079                        && (coding->eol_type == CODING_EOL_CR
3080                            || coding->eol_type == CODING_EOL_CRLF))
3081                 {
3082                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3083                   goto label_end_of_loop;
3084                 }
3085             }
3086         }
3087       else
3088         {
3089           if (sjis_p)
3090             {
3091               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3092                 goto label_invalid_code;
3093               if (c1 <= 0x9F || c1 >= 0xE0)
3094                 {
3095                   /* SJIS -> JISX0208 */
3096                   ONE_MORE_BYTE (c2);
3097                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3098                     goto label_invalid_code;
3099                   DECODE_SJIS (c1, c2, c1, c2);
3100                   charset = charset_jisx0208;
3101                 }
3102               else
3103                 /* SJIS -> JISX0201-Kana */
3104                 charset = charset_katakana_jisx0201;
3105             }
3106           else
3107             {
3108               /* BIG5 -> Big5 */
3109               if (c1 < 0xA0 || c1 > 0xFE)
3110                 goto label_invalid_code;
3111               ONE_MORE_BYTE (c2);
3112               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3113                 goto label_invalid_code;
3114               DECODE_BIG5 (c1, c2, charset, c1, c2);
3115             }
3116         }
3117
3118       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3119       EMIT_CHAR (c);
3120       continue;
3121
3122     label_invalid_code:
3123       coding->errors++;
3124       src = src_base;
3125       c = *src++;
3126       EMIT_CHAR (c);
3127     }
3128
3129  label_end_of_loop:
3130   coding->consumed = coding->consumed_char = src_base - source;
3131   coding->produced = dst - destination;
3132   return;
3133 }
3134
3135 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3136    This function can encode charsets `ascii', `katakana-jisx0201',
3137    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3138    are sure that all these charsets are registered as official charset
3139    (i.e. do not have extended leading-codes).  Characters of other
3140    charsets are produced without any encoding.  If SJIS_P is 1, encode
3141    SJIS text, else encode BIG5 text.  */
3142
3143 static void
3144 encode_coding_sjis_big5 (coding, source, destination,
3145                          src_bytes, dst_bytes, sjis_p)
3146      struct coding_system *coding;
3147      unsigned char *source, *destination;
3148      int src_bytes, dst_bytes;
3149      int sjis_p;
3150 {
3151   unsigned char *src = source;
3152   unsigned char *src_end = source + src_bytes;
3153   unsigned char *dst = destination;
3154   unsigned char *dst_end = destination + dst_bytes;
3155   /* SRC_BASE remembers the start position in source in each loop.
3156      The loop will be exited when there's not enough source text to
3157      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3158      there's not enough destination area to produce encoded codes
3159      (within macro EMIT_BYTES).  */
3160   unsigned char *src_base;
3161   Lisp_Object translation_table;
3162
3163   if (NILP (Venable_character_translation))
3164     translation_table = Qnil;
3165   else
3166     {
3167       translation_table = coding->translation_table_for_encode;
3168       if (NILP (translation_table))
3169         translation_table = Vstandard_translation_table_for_encode;
3170     }
3171
3172   while (1)
3173     {
3174       int c, charset, c1, c2;
3175
3176       src_base = src;
3177       ONE_MORE_CHAR (c);
3178
3179       /* Now encode the character C.  */
3180       if (SINGLE_BYTE_CHAR_P (c))
3181         {
3182           switch (c)
3183             {
3184             case '\r':
3185               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3186                 {
3187                   EMIT_ONE_BYTE (c);
3188                   break;
3189                 }
3190               c = '\n';
3191             case '\n':
3192               if (coding->eol_type == CODING_EOL_CRLF)
3193                 {
3194                   EMIT_TWO_BYTES ('\r', c);
3195                   break;
3196                 }
3197               else if (coding->eol_type == CODING_EOL_CR)
3198                 c = '\r';
3199             default:
3200               EMIT_ONE_BYTE (c);
3201             }
3202         }
3203       else
3204         {
3205           SPLIT_CHAR (c, charset, c1, c2);
3206           if (sjis_p)
3207             {
3208               if (charset == charset_jisx0208
3209                   || charset == charset_jisx0208_1978)
3210                 {
3211                   ENCODE_SJIS (c1, c2, c1, c2);
3212                   EMIT_TWO_BYTES (c1, c2);
3213                 }
3214               else if (charset == charset_katakana_jisx0201)
3215                 EMIT_ONE_BYTE (c1 | 0x80);
3216               else if (charset == charset_latin_jisx0201)
3217                 EMIT_ONE_BYTE (c1);
3218               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3219                 {
3220                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3221                   if (CHARSET_WIDTH (charset) > 1)
3222                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3223                 }
3224               else
3225                 /* There's no way other than producing the internal
3226                    codes as is.  */
3227                 EMIT_BYTES (src_base, src);
3228             }
3229           else
3230             {
3231               if (charset == charset_big5_1 || charset == charset_big5_2)
3232                 {
3233                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3234                   EMIT_TWO_BYTES (c1, c2);
3235                 }
3236               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3237                 {
3238                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3239                   if (CHARSET_WIDTH (charset) > 1)
3240                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3241                 }
3242               else
3243                 /* There's no way other than producing the internal
3244                    codes as is.  */
3245                 EMIT_BYTES (src_base, src);
3246             }
3247         }
3248       coding->consumed_char++;
3249     }
3250
3251  label_end_of_loop:
3252   coding->consumed = src_base - source;
3253   coding->produced = coding->produced_char = dst - destination;
3254 }
3255
3256 \f
3257 /*** 5. CCL handlers ***/
3258
3259 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3260    Check if a text is encoded in a coding system of which
3261    encoder/decoder are written in CCL program.  If it is, return
3262    CODING_CATEGORY_MASK_CCL, else return 0.  */
3263
3264 static int
3265 detect_coding_ccl (src, src_end, multibytep)
3266      unsigned char *src, *src_end;
3267      int multibytep;
3268 {
3269   unsigned char *valid;
3270   int c;
3271   /* Dummy for ONE_MORE_BYTE.  */
3272   struct coding_system dummy_coding;
3273   struct coding_system *coding = &dummy_coding;
3274
3275   /* No coding system is assigned to coding-category-ccl.  */
3276   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3277     return 0;
3278
3279   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3280   while (1)
3281     {
3282       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3283       if (! valid[c])
3284         return 0;
3285     }
3286  label_end_of_loop:
3287   return CODING_CATEGORY_MASK_CCL;
3288 }
3289
3290 \f
3291 /*** 6. End-of-line handlers ***/
3292
3293 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3294
3295 static void
3296 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3297      struct coding_system *coding;
3298      unsigned char *source, *destination;
3299      int src_bytes, dst_bytes;
3300 {
3301   unsigned char *src = source;
3302   unsigned char *dst = destination;
3303   unsigned char *src_end = src + src_bytes;
3304   unsigned char *dst_end = dst + dst_bytes;
3305   Lisp_Object translation_table;
3306   /* SRC_BASE remembers the start position in source in each loop.
3307      The loop will be exited when there's not enough source code
3308      (within macro ONE_MORE_BYTE), or when there's not enough
3309      destination area to produce a character (within macro
3310      EMIT_CHAR).  */
3311   unsigned char *src_base;
3312   int c;
3313
3314   translation_table = Qnil;
3315   switch (coding->eol_type)
3316     {
3317     case CODING_EOL_CRLF:
3318       while (1)
3319         {
3320           src_base = src;
3321           ONE_MORE_BYTE (c);
3322           if (c == '\r')
3323             {
3324               ONE_MORE_BYTE (c);
3325               if (c != '\n')
3326                 {
3327                   src--;
3328                   c = '\r';
3329                 }
3330             }
3331           else if (c == '\n'
3332                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3333             {
3334               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3335               goto label_end_of_loop;
3336             }
3337           EMIT_CHAR (c);
3338         }
3339       break;
3340
3341     case CODING_EOL_CR:
3342       while (1)
3343         {
3344           src_base = src;
3345           ONE_MORE_BYTE (c);
3346           if (c == '\n')
3347             {
3348               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3349                 {
3350                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3351                   goto label_end_of_loop;
3352                 }
3353             }
3354           else if (c == '\r')
3355             c = '\n';
3356           EMIT_CHAR (c);
3357         }
3358       break;
3359
3360     default:                    /* no need for EOL handling */
3361       while (1)
3362         {
3363           src_base = src;
3364           ONE_MORE_BYTE (c);
3365           EMIT_CHAR (c);
3366         }
3367     }
3368
3369  label_end_of_loop:
3370   coding->consumed = coding->consumed_char = src_base - source;
3371   coding->produced = dst - destination;
3372   return;
3373 }
3374
3375 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3376    format of end-of-line according to `coding->eol_type'.  It also
3377    convert multibyte form 8-bit characters to unibyte if
3378    CODING->src_multibyte is nonzero.  If `coding->mode &
3379    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3380    also means end-of-line.  */
3381
3382 static void
3383 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3384      struct coding_system *coding;
3385      const unsigned char *source;
3386      unsigned char *destination;
3387      int src_bytes, dst_bytes;
3388 {
3389   const unsigned char *src = source;
3390   unsigned char *dst = destination;
3391   const unsigned char *src_end = src + src_bytes;
3392   unsigned char *dst_end = dst + dst_bytes;
3393   Lisp_Object translation_table;
3394   /* SRC_BASE remembers the start position in source in each loop.
3395      The loop will be exited when there's not enough source text to
3396      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3397      there's not enough destination area to produce encoded codes
3398      (within macro EMIT_BYTES).  */
3399   const unsigned char *src_base;
3400   unsigned char *tmp;
3401   int c;
3402   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3403
3404   translation_table = Qnil;
3405   if (coding->src_multibyte
3406       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3407     {
3408       src_end--;
3409       src_bytes--;
3410       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3411     }
3412
3413   if (coding->eol_type == CODING_EOL_CRLF)
3414     {
3415       while (src < src_end)
3416         {
3417           src_base = src;
3418           c = *src++;
3419           if (c >= 0x20)
3420             EMIT_ONE_BYTE (c);
3421           else if (c == '\n' || (c == '\r' && selective_display))
3422             EMIT_TWO_BYTES ('\r', '\n');
3423           else
3424             EMIT_ONE_BYTE (c);
3425         }
3426       src_base = src;
3427     label_end_of_loop:
3428       ;
3429     }
3430   else
3431     {
3432       if (!dst_bytes || src_bytes <= dst_bytes)
3433         {
3434           safe_bcopy (src, dst, src_bytes);
3435           src_base = src_end;
3436           dst += src_bytes;
3437         }
3438       else
3439         {
3440           if (coding->src_multibyte
3441               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3442             dst_bytes--;
3443           safe_bcopy (src, dst, dst_bytes);
3444           src_base = src + dst_bytes;
3445           dst = destination + dst_bytes;
3446           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3447         }
3448       if (coding->eol_type == CODING_EOL_CR)
3449         {
3450           for (tmp = destination; tmp < dst; tmp++)
3451             if (*tmp == '\n') *tmp = '\r';
3452         }
3453       else if (selective_display)
3454         {
3455           for (tmp = destination; tmp < dst; tmp++)
3456             if (*tmp == '\r') *tmp = '\n';
3457         }
3458     }
3459   if (coding->src_multibyte)
3460     dst = destination + str_as_unibyte (destination, dst - destination);
3461
3462   coding->consumed = src_base - source;
3463   coding->produced = dst - destination;
3464   coding->produced_char = coding->produced;
3465 }
3466
3467 \f
3468 /*** 7. C library functions ***/
3469
3470 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3471    has a property `coding-system'.  The value of this property is a
3472    vector of length 5 (called the coding-vector).  Among elements of
3473    this vector, the first (element[0]) and the fifth (element[4])
3474    carry important information for decoding/encoding.  Before
3475    decoding/encoding, this information should be set in fields of a
3476    structure of type `coding_system'.
3477
3478    The value of the property `coding-system' can be a symbol of another
3479    subsidiary coding-system.  In that case, Emacs gets coding-vector
3480    from that symbol.
3481
3482    `element[0]' contains information to be set in `coding->type'.  The
3483    value and its meaning is as follows:
3484
3485    0 -- coding_type_emacs_mule
3486    1 -- coding_type_sjis
3487    2 -- coding_type_iso2022
3488    3 -- coding_type_big5
3489    4 -- coding_type_ccl encoder/decoder written in CCL
3490    nil -- coding_type_no_conversion
3491    t -- coding_type_undecided (automatic conversion on decoding,
3492                                no-conversion on encoding)
3493
3494    `element[4]' contains information to be set in `coding->flags' and
3495    `coding->spec'.  The meaning varies by `coding->type'.
3496
3497    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3498    of length 32 (of which the first 13 sub-elements are used now).
3499    Meanings of these sub-elements are:
3500
3501    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3502         If the value is an integer of valid charset, the charset is
3503         assumed to be designated to graphic register N initially.
3504
3505         If the value is negative, it is the negated value of a charset which
3506         reserves graphic register N, which means that the charset is
3507         not designated initially but should be designated to graphic
3508         register N just before encoding a character in that charset.
3509
3510         If the value is nil, graphic register N is never used on
3511         encoding.
3512
3513    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3514         Each value takes t or nil.  See the section ISO2022 of
3515         `coding.h' for more information.
3516
3517    If `coding->type' is `coding_type_big5', element[4] is t to denote
3518    BIG5-ETen or nil to denote BIG5-HKU.
3519
3520    If `coding->type' takes any other value, element[4] is ignored.
3521
3522    Emacs Lisp's coding systems also carry information about format of
3523    end-of-line in a value of property `eol-type'.  If the value is
3524    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3525    means CODING_EOL_CR.  If it is not integer, it should be a vector
3526    of subsidiary coding systems of which property `eol-type' has one
3527    of the above values.
3528
3529 */
3530
3531 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3532    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3533    is setup so that no conversion is necessary and return -1, else
3534    return 0.  */
3535
3536 int
3537 setup_coding_system (coding_system, coding)
3538      Lisp_Object coding_system;
3539      struct coding_system *coding;
3540 {
3541   Lisp_Object coding_spec, coding_type, eol_type, plist;
3542   Lisp_Object val;
3543
3544   /* At first, zero clear all members.  */
3545   bzero (coding, sizeof (struct coding_system));
3546
3547   /* Initialize some fields required for all kinds of coding systems.  */
3548   coding->symbol = coding_system;
3549   coding->heading_ascii = -1;
3550   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3551   coding->composing = COMPOSITION_DISABLED;
3552   coding->cmp_data = NULL;
3553
3554   if (NILP (coding_system))
3555     goto label_invalid_coding_system;
3556
3557   coding_spec = Fget (coding_system, Qcoding_system);
3558
3559   if (!VECTORP (coding_spec)
3560       || XVECTOR (coding_spec)->size != 5
3561       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3562     goto label_invalid_coding_system;
3563
3564   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3565   if (VECTORP (eol_type))
3566     {
3567       coding->eol_type = CODING_EOL_UNDECIDED;
3568       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3569     }
3570   else if (XFASTINT (eol_type) == 1)
3571     {
3572       coding->eol_type = CODING_EOL_CRLF;
3573       coding->common_flags
3574         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3575     }
3576   else if (XFASTINT (eol_type) == 2)
3577     {
3578       coding->eol_type = CODING_EOL_CR;
3579       coding->common_flags
3580         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3581     }
3582   else
3583     coding->eol_type = CODING_EOL_LF;
3584
3585   coding_type = XVECTOR (coding_spec)->contents[0];
3586   /* Try short cut.  */
3587   if (SYMBOLP (coding_type))
3588     {
3589       if (EQ (coding_type, Qt))
3590         {
3591           coding->type = coding_type_undecided;
3592           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3593         }
3594       else
3595         coding->type = coding_type_no_conversion;
3596       /* Initialize this member.  Any thing other than
3597          CODING_CATEGORY_IDX_UTF_16_BE and
3598          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3599          special treatment in detect_eol.  */
3600       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3601
3602       return 0;
3603     }
3604
3605   /* Get values of coding system properties:
3606      `post-read-conversion', `pre-write-conversion',
3607      `translation-table-for-decode', `translation-table-for-encode'.  */
3608   plist = XVECTOR (coding_spec)->contents[3];
3609   /* Pre & post conversion functions should be disabled if
3610      inhibit_eol_conversion is nonzero.  This is the case that a code
3611      conversion function is called while those functions are running.  */
3612   if (! inhibit_pre_post_conversion)
3613     {
3614       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3615       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3616     }
3617   val = Fplist_get (plist, Qtranslation_table_for_decode);
3618   if (SYMBOLP (val))
3619     val = Fget (val, Qtranslation_table_for_decode);
3620   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3621   val = Fplist_get (plist, Qtranslation_table_for_encode);
3622   if (SYMBOLP (val))
3623     val = Fget (val, Qtranslation_table_for_encode);
3624   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3625   val = Fplist_get (plist, Qcoding_category);
3626   if (!NILP (val))
3627     {
3628       val = Fget (val, Qcoding_category_index);
3629       if (INTEGERP (val))
3630         coding->category_idx = XINT (val);
3631       else
3632         goto label_invalid_coding_system;
3633     }
3634   else
3635     goto label_invalid_coding_system;
3636
3637   /* If the coding system has non-nil `composition' property, enable
3638      composition handling.  */
3639   val = Fplist_get (plist, Qcomposition);
3640   if (!NILP (val))
3641     coding->composing = COMPOSITION_NO;
3642
3643   switch (XFASTINT (coding_type))
3644     {
3645     case 0:
3646       coding->type = coding_type_emacs_mule;
3647       coding->common_flags
3648         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3649       if (!NILP (coding->post_read_conversion))
3650         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3651       if (!NILP (coding->pre_write_conversion))
3652         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3653       break;
3654
3655     case 1:
3656       coding->type = coding_type_sjis;
3657       coding->common_flags
3658         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3659       break;
3660
3661     case 2:
3662       coding->type = coding_type_iso2022;
3663       coding->common_flags
3664         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3665       {
3666         Lisp_Object val, temp;
3667         Lisp_Object *flags;
3668         int i, charset, reg_bits = 0;
3669
3670         val = XVECTOR (coding_spec)->contents[4];
3671
3672         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3673           goto label_invalid_coding_system;
3674
3675         flags = XVECTOR (val)->contents;
3676         coding->flags
3677           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3678              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3679              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3680              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3681              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3682              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3683              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3684              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3685              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3686              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3687              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3688              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3689              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3690              );
3691
3692         /* Invoke graphic register 0 to plane 0.  */
3693         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3694         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3695         CODING_SPEC_ISO_INVOCATION (coding, 1)
3696           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3697         /* Not single shifting at first.  */
3698         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3699         /* Beginning of buffer should also be regarded as bol. */
3700         CODING_SPEC_ISO_BOL (coding) = 1;
3701
3702         for (charset = 0; charset <= MAX_CHARSET; charset++)
3703           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3704         val = Vcharset_revision_alist;
3705         while (CONSP (val))
3706           {
3707             charset = get_charset_id (Fcar_safe (XCAR (val)));
3708             if (charset >= 0
3709                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3710                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3711               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3712             val = XCDR (val);
3713           }
3714
3715         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3716            FLAGS[REG] can be one of below:
3717                 integer CHARSET: CHARSET occupies register I,
3718                 t: designate nothing to REG initially, but can be used
3719                   by any charsets,
3720                 list of integer, nil, or t: designate the first
3721                   element (if integer) to REG initially, the remaining
3722                   elements (if integer) is designated to REG on request,
3723                   if an element is t, REG can be used by any charsets,
3724                 nil: REG is never used.  */
3725         for (charset = 0; charset <= MAX_CHARSET; charset++)
3726           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3727             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3728         for (i = 0; i < 4; i++)
3729           {
3730             if ((INTEGERP (flags[i])
3731                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3732                 || (charset = get_charset_id (flags[i])) >= 0)
3733               {
3734                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3735                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3736               }
3737             else if (EQ (flags[i], Qt))
3738               {
3739                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3740                 reg_bits |= 1 << i;
3741                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3742               }
3743             else if (CONSP (flags[i]))
3744               {
3745                 Lisp_Object tail;
3746                 tail = flags[i];
3747
3748                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3749                 if ((INTEGERP (XCAR (tail))
3750                      && (charset = XINT (XCAR (tail)),
3751                          CHARSET_VALID_P (charset)))
3752                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3753                   {
3754                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3755                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3756                   }
3757                 else
3758                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3759                 tail = XCDR (tail);
3760                 while (CONSP (tail))
3761                   {
3762                     if ((INTEGERP (XCAR (tail))
3763                          && (charset = XINT (XCAR (tail)),
3764                              CHARSET_VALID_P (charset)))
3765                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3766                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3767                         = i;
3768                     else if (EQ (XCAR (tail), Qt))
3769                       reg_bits |= 1 << i;
3770                     tail = XCDR (tail);
3771                   }
3772               }
3773             else
3774               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3775
3776             CODING_SPEC_ISO_DESIGNATION (coding, i)
3777               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3778           }
3779
3780         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3781           {
3782             /* REG 1 can be used only by locking shift in 7-bit env.  */
3783             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3784               reg_bits &= ~2;
3785             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3786               /* Without any shifting, only REG 0 and 1 can be used.  */
3787               reg_bits &= 3;
3788           }
3789
3790         if (reg_bits)
3791           for (charset = 0; charset <= MAX_CHARSET; charset++)
3792             {
3793               if (CHARSET_DEFINED_P (charset)
3794                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3795                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3796                 {
3797                   /* There exist some default graphic registers to be
3798                      used by CHARSET.  */
3799
3800                   /* We had better avoid designating a charset of
3801                      CHARS96 to REG 0 as far as possible.  */
3802                   if (CHARSET_CHARS (charset) == 96)
3803                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3804                       = (reg_bits & 2
3805                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3806                   else
3807                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3808                       = (reg_bits & 1
3809                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3810                 }
3811             }
3812       }
3813       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3814       coding->spec.iso2022.last_invalid_designation_register = -1;
3815       break;
3816
3817     case 3:
3818       coding->type = coding_type_big5;
3819       coding->common_flags
3820         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3821       coding->flags
3822         = (NILP (XVECTOR (coding_spec)->contents[4])
3823            ? CODING_FLAG_BIG5_HKU
3824            : CODING_FLAG_BIG5_ETEN);
3825       break;
3826
3827     case 4:
3828       coding->type = coding_type_ccl;
3829       coding->common_flags
3830         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3831       {
3832         val = XVECTOR (coding_spec)->contents[4];
3833         if (! CONSP (val)
3834             || setup_ccl_program (&(coding->spec.ccl.decoder),
3835                                   XCAR (val)) < 0
3836             || setup_ccl_program (&(coding->spec.ccl.encoder),
3837                                   XCDR (val)) < 0)
3838           goto label_invalid_coding_system;
3839
3840         bzero (coding->spec.ccl.valid_codes, 256);
3841         val = Fplist_get (plist, Qvalid_codes);
3842         if (CONSP (val))
3843           {
3844             Lisp_Object this;
3845
3846             for (; CONSP (val); val = XCDR (val))
3847               {
3848                 this = XCAR (val);
3849                 if (INTEGERP (this)
3850                     && XINT (this) >= 0 && XINT (this) < 256)
3851                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3852                 else if (CONSP (this)
3853                          && INTEGERP (XCAR (this))
3854                          && INTEGERP (XCDR (this)))
3855                   {
3856                     int start = XINT (XCAR (this));
3857                     int end = XINT (XCDR (this));
3858
3859                     if (start >= 0 && start <= end && end < 256)
3860                       while (start <= end)
3861                         coding->spec.ccl.valid_codes[start++] = 1;
3862                   }
3863               }
3864           }
3865       }
3866       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3867       coding->spec.ccl.cr_carryover = 0;
3868       coding->spec.ccl.eight_bit_carryover[0] = 0;
3869       break;
3870
3871     case 5:
3872       coding->type = coding_type_raw_text;
3873       break;
3874
3875     default:
3876       goto label_invalid_coding_system;
3877     }
3878   return 0;
3879
3880  label_invalid_coding_system:
3881   coding->type = coding_type_no_conversion;
3882   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3883   coding->common_flags = 0;
3884   coding->eol_type = CODING_EOL_LF;
3885   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3886   return -1;
3887 }
3888
3889 /* Free memory blocks allocated for storing composition information.  */
3890
3891 void
3892 coding_free_composition_data (coding)
3893      struct coding_system *coding;
3894 {
3895   struct composition_data *cmp_data = coding->cmp_data, *next;
3896
3897   if (!cmp_data)
3898     return;
3899   /* Memory blocks are chained.  At first, rewind to the first, then,
3900      free blocks one by one.  */
3901   while (cmp_data->prev)
3902     cmp_data = cmp_data->prev;
3903   while (cmp_data)
3904     {
3905       next = cmp_data->next;
3906       xfree (cmp_data);
3907       cmp_data = next;
3908     }
3909   coding->cmp_data = NULL;
3910 }
3911
3912 /* Set `char_offset' member of all memory blocks pointed by
3913    coding->cmp_data to POS.  */
3914
3915 void
3916 coding_adjust_composition_offset (coding, pos)
3917      struct coding_system *coding;
3918      int pos;
3919 {
3920   struct composition_data *cmp_data;
3921
3922   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3923     cmp_data->char_offset = pos;
3924 }
3925
3926 /* Setup raw-text or one of its subsidiaries in the structure
3927    coding_system CODING according to the already setup value eol_type
3928    in CODING.  CODING should be setup for some coding system in
3929    advance.  */
3930
3931 void
3932 setup_raw_text_coding_system (coding)
3933      struct coding_system *coding;
3934 {
3935   if (coding->type != coding_type_raw_text)
3936     {
3937       coding->symbol = Qraw_text;
3938       coding->type = coding_type_raw_text;
3939       if (coding->eol_type != CODING_EOL_UNDECIDED)
3940         {
3941           Lisp_Object subsidiaries;
3942           subsidiaries = Fget (Qraw_text, Qeol_type);
3943
3944           if (VECTORP (subsidiaries)
3945               && XVECTOR (subsidiaries)->size == 3)
3946             coding->symbol
3947               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3948         }
3949       setup_coding_system (coding->symbol, coding);
3950     }
3951   return;
3952 }
3953
3954 /* Emacs has a mechanism to automatically detect a coding system if it
3955    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3956    it's impossible to distinguish some coding systems accurately
3957    because they use the same range of codes.  So, at first, coding
3958    systems are classified into the following categories:
3959
3960    o coding-category-emacs-mule
3961
3962         The category for a coding system which has the same code range
3963         as Emacs' internal format.  Assigned the coding-system (Lisp
3964         symbol) `emacs-mule' by default.
3965
3966    o coding-category-sjis
3967
3968         The category for a coding system which has the same code range
3969         as SJIS.  Assigned the coding-system (Lisp
3970         symbol) `japanese-shift-jis' by default.
3971
3972    o coding-category-iso-7
3973
3974         The category for a coding system which has the same code range
3975         as ISO2022 of 7-bit environment.  This doesn't use any locking
3976         shift and single shift functions.  This can encode/decode all
3977         charsets.  Assigned the coding-system (Lisp symbol)
3978         `iso-2022-7bit' by default.
3979
3980    o coding-category-iso-7-tight
3981
3982         Same as coding-category-iso-7 except that this can
3983         encode/decode only the specified charsets.
3984
3985    o coding-category-iso-8-1
3986
3987         The category for a coding system which has the same code range
3988         as ISO2022 of 8-bit environment and graphic plane 1 used only
3989         for DIMENSION1 charset.  This doesn't use any locking shift
3990         and single shift functions.  Assigned the coding-system (Lisp
3991         symbol) `iso-latin-1' by default.
3992
3993    o coding-category-iso-8-2
3994
3995         The category for a coding system which has the same code range
3996         as ISO2022 of 8-bit environment and graphic plane 1 used only
3997         for DIMENSION2 charset.  This doesn't use any locking shift
3998         and single shift functions.  Assigned the coding-system (Lisp
3999         symbol) `japanese-iso-8bit' by default.
4000
4001    o coding-category-iso-7-else
4002
4003         The category for a coding system which has the same code range
4004         as ISO2022 of 7-bit environment but uses locking shift or
4005         single shift functions.  Assigned the coding-system (Lisp
4006         symbol) `iso-2022-7bit-lock' by default.
4007
4008    o coding-category-iso-8-else
4009
4010         The category for a coding system which has the same code range
4011         as ISO2022 of 8-bit environment but uses locking shift or
4012         single shift functions.  Assigned the coding-system (Lisp
4013         symbol) `iso-2022-8bit-ss2' by default.
4014
4015    o coding-category-big5
4016
4017         The category for a coding system which has the same code range
4018         as BIG5.  Assigned the coding-system (Lisp symbol)
4019         `cn-big5' by default.
4020
4021    o coding-category-utf-8
4022
4023         The category for a coding system which has the same code range
4024         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
4025         symbol) `utf-8' by default.
4026
4027    o coding-category-utf-16-be
4028
4029         The category for a coding system in which a text has an
4030         Unicode signature (cf. Unicode Standard) in the order of BIG
4031         endian at the head.  Assigned the coding-system (Lisp symbol)
4032         `utf-16-be' by default.
4033
4034    o coding-category-utf-16-le
4035
4036         The category for a coding system in which a text has an
4037         Unicode signature (cf. Unicode Standard) in the order of
4038         LITTLE endian at the head.  Assigned the coding-system (Lisp
4039         symbol) `utf-16-le' by default.
4040
4041    o coding-category-ccl
4042
4043         The category for a coding system of which encoder/decoder is
4044         written in CCL programs.  The default value is nil, i.e., no
4045         coding system is assigned.
4046
4047    o coding-category-binary
4048
4049         The category for a coding system not categorized in any of the
4050         above.  Assigned the coding-system (Lisp symbol)
4051         `no-conversion' by default.
4052
4053    Each of them is a Lisp symbol and the value is an actual
4054    `coding-system' (this is also a Lisp symbol) assigned by a user.
4055    What Emacs does actually is to detect a category of coding system.
4056    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4057    decide a single possible category, it selects a category of the
4058    highest priority.  Priorities of categories are also specified by a
4059    user in a Lisp variable `coding-category-list'.
4060
4061 */
4062
4063 static
4064 int ascii_skip_code[256];
4065
4066 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4067    If it detects possible coding systems, return an integer in which
4068    appropriate flag bits are set.  Flag bits are defined by macros
4069    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4070    it should point the table `coding_priorities'.  In that case, only
4071    the flag bit for a coding system of the highest priority is set in
4072    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4073    range 0x80..0x9F are in multibyte form.
4074
4075    How many ASCII characters are at the head is returned as *SKIP.  */
4076
4077 static int
4078 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4079      unsigned char *source;
4080      int src_bytes, *priorities, *skip;
4081      int multibytep;
4082 {
4083   register unsigned char c;
4084   unsigned char *src = source, *src_end = source + src_bytes;
4085   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4086   int i;
4087
4088   /* At first, skip all ASCII characters and control characters except
4089      for three ISO2022 specific control characters.  */
4090   ascii_skip_code[ISO_CODE_SO] = 0;
4091   ascii_skip_code[ISO_CODE_SI] = 0;
4092   ascii_skip_code[ISO_CODE_ESC] = 0;
4093
4094  label_loop_detect_coding:
4095   while (src < src_end && ascii_skip_code[*src]) src++;
4096   *skip = src - source;
4097
4098   if (src >= src_end)
4099     /* We found nothing other than ASCII.  There's nothing to do.  */
4100     return 0;
4101
4102   c = *src;
4103   /* The text seems to be encoded in some multilingual coding system.
4104      Now, try to find in which coding system the text is encoded.  */
4105   if (c < 0x80)
4106     {
4107       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4108       /* C is an ISO2022 specific control code of C0.  */
4109       mask = detect_coding_iso2022 (src, src_end, multibytep);
4110       if (mask == 0)
4111         {
4112           /* No valid ISO2022 code follows C.  Try again.  */
4113           src++;
4114           if (c == ISO_CODE_ESC)
4115             ascii_skip_code[ISO_CODE_ESC] = 1;
4116           else
4117             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4118           goto label_loop_detect_coding;
4119         }
4120       if (priorities)
4121         {
4122           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4123             {
4124               if (mask & priorities[i])
4125                 return priorities[i];
4126             }
4127           return CODING_CATEGORY_MASK_RAW_TEXT;
4128         }
4129     }
4130   else
4131     {
4132       int try;
4133
4134       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4135         c = src[1] - 0x20;
4136
4137       if (c < 0xA0)
4138         {
4139           /* C is the first byte of SJIS character code,
4140              or a leading-code of Emacs' internal format (emacs-mule),
4141              or the first byte of UTF-16.  */
4142           try = (CODING_CATEGORY_MASK_SJIS
4143                   | CODING_CATEGORY_MASK_EMACS_MULE
4144                   | CODING_CATEGORY_MASK_UTF_16_BE
4145                   | CODING_CATEGORY_MASK_UTF_16_LE);
4146
4147           /* Or, if C is a special latin extra code,
4148              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4149              or is an ISO2022 control-sequence-introducer (CSI),
4150              we should also consider the possibility of ISO2022 codings.  */
4151           if ((VECTORP (Vlatin_extra_code_table)
4152                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4153               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4154               || (c == ISO_CODE_CSI
4155                   && (src < src_end
4156                       && (*src == ']'
4157                           || ((*src == '0' || *src == '1' || *src == '2')
4158                               && src + 1 < src_end
4159                               && src[1] == ']')))))
4160             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4161                      | CODING_CATEGORY_MASK_ISO_8BIT);
4162         }
4163       else
4164         /* C is a character of ISO2022 in graphic plane right,
4165            or a SJIS's 1-byte character code (i.e. JISX0201),
4166            or the first byte of BIG5's 2-byte code,
4167            or the first byte of UTF-8/16.  */
4168         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4169                 | CODING_CATEGORY_MASK_ISO_8BIT
4170                 | CODING_CATEGORY_MASK_SJIS
4171                 | CODING_CATEGORY_MASK_BIG5
4172                 | CODING_CATEGORY_MASK_UTF_8
4173                 | CODING_CATEGORY_MASK_UTF_16_BE
4174                 | CODING_CATEGORY_MASK_UTF_16_LE);
4175
4176       /* Or, we may have to consider the possibility of CCL.  */
4177       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4178           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4179               ->spec.ccl.valid_codes)[c])
4180         try |= CODING_CATEGORY_MASK_CCL;
4181
4182       mask = 0;
4183       utf16_examined_p = iso2022_examined_p = 0;
4184       if (priorities)
4185         {
4186           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4187             {
4188               if (!iso2022_examined_p
4189                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4190                 {
4191                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4192                   iso2022_examined_p = 1;
4193                 }
4194               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4195                 mask |= detect_coding_sjis (src, src_end, multibytep);
4196               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4197                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4198               else if (!utf16_examined_p
4199                        && (priorities[i] & try &
4200                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4201                 {
4202                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4203                   utf16_examined_p = 1;
4204                 }
4205               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4206                 mask |= detect_coding_big5 (src, src_end, multibytep);
4207               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4208                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4209               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4210                 mask |= detect_coding_ccl (src, src_end, multibytep);
4211               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4212                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4213               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4214                 mask |= CODING_CATEGORY_MASK_BINARY;
4215               if (mask & priorities[i])
4216                 return priorities[i];
4217             }
4218           return CODING_CATEGORY_MASK_RAW_TEXT;
4219         }
4220       if (try & CODING_CATEGORY_MASK_ISO)
4221         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4222       if (try & CODING_CATEGORY_MASK_SJIS)
4223         mask |= detect_coding_sjis (src, src_end, multibytep);
4224       if (try & CODING_CATEGORY_MASK_BIG5)
4225         mask |= detect_coding_big5 (src, src_end, multibytep);
4226       if (try & CODING_CATEGORY_MASK_UTF_8)
4227         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4228       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4229         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4230       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4231         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4232       if (try & CODING_CATEGORY_MASK_CCL)
4233         mask |= detect_coding_ccl (src, src_end, multibytep);
4234     }
4235   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4236 }
4237
4238 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4239    The information of the detected coding system is set in CODING.  */
4240
4241 void
4242 detect_coding (coding, src, src_bytes)
4243      struct coding_system *coding;
4244      const unsigned char *src;
4245      int src_bytes;
4246 {
4247   unsigned int idx;
4248   int skip, mask;
4249   Lisp_Object val;
4250
4251   val = Vcoding_category_list;
4252   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4253                              coding->src_multibyte);
4254   coding->heading_ascii = skip;
4255
4256   if (!mask) return;
4257
4258   /* We found a single coding system of the highest priority in MASK.  */
4259   idx = 0;
4260   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4261   if (! mask)
4262     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4263
4264   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4265
4266   if (coding->eol_type != CODING_EOL_UNDECIDED)
4267     {
4268       Lisp_Object tmp;
4269
4270       tmp = Fget (val, Qeol_type);
4271       if (VECTORP (tmp))
4272         val = XVECTOR (tmp)->contents[coding->eol_type];
4273     }
4274
4275   /* Setup this new coding system while preserving some slots.  */
4276   {
4277     int src_multibyte = coding->src_multibyte;
4278     int dst_multibyte = coding->dst_multibyte;
4279
4280     setup_coding_system (val, coding);
4281     coding->src_multibyte = src_multibyte;
4282     coding->dst_multibyte = dst_multibyte;
4283     coding->heading_ascii = skip;
4284   }
4285 }
4286
4287 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4288    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4289    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4290
4291    How many non-eol characters are at the head is returned as *SKIP.  */
4292
4293 #define MAX_EOL_CHECK_COUNT 3
4294
4295 static int
4296 detect_eol_type (source, src_bytes, skip)
4297      unsigned char *source;
4298      int src_bytes, *skip;
4299 {
4300   unsigned char *src = source, *src_end = src + src_bytes;
4301   unsigned char c;
4302   int total = 0;                /* How many end-of-lines are found so far.  */
4303   int eol_type = CODING_EOL_UNDECIDED;
4304   int this_eol_type;
4305
4306   *skip = 0;
4307
4308   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4309     {
4310       c = *src++;
4311       if (c == '\n' || c == '\r')
4312         {
4313           if (*skip == 0)
4314             *skip = src - 1 - source;
4315           total++;
4316           if (c == '\n')
4317             this_eol_type = CODING_EOL_LF;
4318           else if (src >= src_end || *src != '\n')
4319             this_eol_type = CODING_EOL_CR;
4320           else
4321             this_eol_type = CODING_EOL_CRLF, src++;
4322
4323           if (eol_type == CODING_EOL_UNDECIDED)
4324             /* This is the first end-of-line.  */
4325             eol_type = this_eol_type;
4326           else if (eol_type != this_eol_type)
4327             {
4328               /* The found type is different from what found before.  */
4329               eol_type = CODING_EOL_INCONSISTENT;
4330               break;
4331             }
4332         }
4333     }
4334
4335   if (*skip == 0)
4336     *skip = src_end - source;
4337   return eol_type;
4338 }
4339
4340 /* Like detect_eol_type, but detect EOL type in 2-octet
4341    big-endian/little-endian format for coding systems utf-16-be and
4342    utf-16-le.  */
4343
4344 static int
4345 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4346      unsigned char *source;
4347      int src_bytes, *skip, big_endian_p;
4348 {
4349   unsigned char *src = source, *src_end = src + src_bytes;
4350   unsigned int c1, c2;
4351   int total = 0;                /* How many end-of-lines are found so far.  */
4352   int eol_type = CODING_EOL_UNDECIDED;
4353   int this_eol_type;
4354   int msb, lsb;
4355
4356   if (big_endian_p)
4357     msb = 0, lsb = 1;
4358   else
4359     msb = 1, lsb = 0;
4360
4361   *skip = 0;
4362
4363   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4364     {
4365       c1 = (src[msb] << 8) | (src[lsb]);
4366       src += 2;
4367
4368       if (c1 == '\n' || c1 == '\r')
4369         {
4370           if (*skip == 0)
4371             *skip = src - 2 - source;
4372           total++;
4373           if (c1 == '\n')
4374             {
4375               this_eol_type = CODING_EOL_LF;
4376             }
4377           else
4378             {
4379               if ((src + 1) >= src_end)
4380                 {
4381                   this_eol_type = CODING_EOL_CR;
4382                 }
4383               else
4384                 {
4385                   c2 = (src[msb] << 8) | (src[lsb]);
4386                   if (c2 == '\n')
4387                     this_eol_type = CODING_EOL_CRLF, src += 2;
4388                   else
4389                     this_eol_type = CODING_EOL_CR;
4390                 }
4391             }
4392
4393           if (eol_type == CODING_EOL_UNDECIDED)
4394             /* This is the first end-of-line.  */
4395             eol_type = this_eol_type;
4396           else if (eol_type != this_eol_type)
4397             {
4398               /* The found type is different from what found before.  */
4399               eol_type = CODING_EOL_INCONSISTENT;
4400               break;
4401             }
4402         }
4403     }
4404
4405   if (*skip == 0)
4406     *skip = src_end - source;
4407   return eol_type;
4408 }
4409
4410 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4411    is encoded.  If it detects an appropriate format of end-of-line, it
4412    sets the information in *CODING.  */
4413
4414 void
4415 detect_eol (coding, src, src_bytes)
4416      struct coding_system *coding;
4417      const unsigned char *src;
4418      int src_bytes;
4419 {
4420   Lisp_Object val;
4421   int skip;
4422   int eol_type;
4423
4424   switch (coding->category_idx)
4425     {
4426     case CODING_CATEGORY_IDX_UTF_16_BE:
4427       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4428       break;
4429     case CODING_CATEGORY_IDX_UTF_16_LE:
4430       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4431       break;
4432     default:
4433       eol_type = detect_eol_type (src, src_bytes, &skip);
4434       break;
4435     }
4436
4437   if (coding->heading_ascii > skip)
4438     coding->heading_ascii = skip;
4439   else
4440     skip = coding->heading_ascii;
4441
4442   if (eol_type == CODING_EOL_UNDECIDED)
4443     return;
4444   if (eol_type == CODING_EOL_INCONSISTENT)
4445     {
4446 #if 0
4447       /* This code is suppressed until we find a better way to
4448          distinguish raw text file and binary file.  */
4449
4450       /* If we have already detected that the coding is raw-text, the
4451          coding should actually be no-conversion.  */
4452       if (coding->type == coding_type_raw_text)
4453         {
4454           setup_coding_system (Qno_conversion, coding);
4455           return;
4456         }
4457       /* Else, let's decode only text code anyway.  */
4458 #endif /* 0 */
4459       eol_type = CODING_EOL_LF;
4460     }
4461
4462   val = Fget (coding->symbol, Qeol_type);
4463   if (VECTORP (val) && XVECTOR (val)->size == 3)
4464     {
4465       int src_multibyte = coding->src_multibyte;
4466       int dst_multibyte = coding->dst_multibyte;
4467       struct composition_data *cmp_data = coding->cmp_data;
4468
4469       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4470       coding->src_multibyte = src_multibyte;
4471       coding->dst_multibyte = dst_multibyte;
4472       coding->heading_ascii = skip;
4473       coding->cmp_data = cmp_data;
4474     }
4475 }
4476
/* Extra bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied when estimating a
   decoding buffer for CODING (a pointer to struct coding_system): 3
   for ISO 2022, the CCL decoder's buf_magnification for CCL, and 2
   for anything else.  CODING is parenthesized so that any pointer
   expression may be passed; it may be evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                       \
  ((coding)->type == coding_type_iso2022                  \
   ? 3                                                    \
   : ((coding)->type == coding_type_ccl                   \
      ? (coding)->spec.ccl.decoder.buf_magnification      \
      : 2))
4485
4486 /* Return maximum size (bytes) of a buffer enough for decoding
4487    SRC_BYTES of text encoded in CODING.  */
4488
4489 int
4490 decoding_buffer_size (coding, src_bytes)
4491      struct coding_system *coding;
4492      int src_bytes;
4493 {
4494   return (src_bytes * DECODING_BUFFER_MAG (coding)
4495           + CONVERSION_BUFFER_EXTRA_ROOM);
4496 }
4497
4498 /* Return maximum size (bytes) of a buffer enough for encoding
4499    SRC_BYTES of text to CODING.  */
4500
4501 int
4502 encoding_buffer_size (coding, src_bytes)
4503      struct coding_system *coding;
4504      int src_bytes;
4505 {
4506   int magnification;
4507
4508   if (coding->type == coding_type_ccl)
4509     {
4510       magnification = coding->spec.ccl.encoder.buf_magnification;
4511       if (coding->eol_type == CODING_EOL_CRLF)
4512         magnification *= 2;
4513     }
4514   else if (CODING_REQUIRE_ENCODING (coding))
4515     magnification = 3;
4516   else
4517     magnification = 1;
4518
4519   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4520 }
4521
/* Scratch buffer used while converting text.  */
struct conversion_buffer
{
  /* Number of bytes DATA points to.  */
  int size;
  /* 1 if DATA was obtained from alloca, 0 if it is heap memory.  */
  int on_stack;
  /* The storage itself.  */
  unsigned char *data;
};
4529
/* Don't use alloca for allocating memory space larger than this, lest
   we overflow the stack.  The value is parenthesized so that it
   expands safely inside any expression.  */
#define MAX_ALLOCA (16*1024)

/* Allocate LEN bytes of memory for BUF (an lvalue of type struct
   conversion_buffer).  Requests smaller than MAX_ALLOCA are served by
   alloca and marked on_stack; larger ones come from xmalloc.

   Both arguments are parenthesized, so any expression may be passed,
   but LEN is evaluated more than once and must not have side
   effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4549
4550 /* Double the allocated memory for *BUF.  */
4551 static void
4552 extend_conversion_buffer (buf)
4553      struct conversion_buffer *buf;
4554 {
4555   if (buf->on_stack)
4556     {
4557       unsigned char *save = buf->data;
4558       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4559       bcopy (save, buf->data, buf->size);
4560       buf->on_stack = 0;
4561     }
4562   else
4563     {
4564       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4565     }
4566   buf->size *= 2;
4567 }
4568
4569 /* Free the allocated memory for BUF if it is not on stack.  */
4570 static void
4571 free_conversion_buffer (buf)
4572      struct conversion_buffer *buf;
4573 {
4574   if (!buf->on_stack)
4575     xfree (buf->data);
4576 }
4577
4578 int
4579 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4580      struct coding_system *coding;
4581      unsigned char *source, *destination;
4582      int src_bytes, dst_bytes, encodep;
4583 {
4584   struct ccl_program *ccl
4585     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4586   unsigned char *dst = destination;
4587
4588   ccl->suppress_error = coding->suppress_error;
4589   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4590   if (encodep)
4591     {
4592       /* On encoding, EOL format is converted within ccl_driver.  For
4593          that, setup proper information in the structure CCL.  */
4594       ccl->eol_type = coding->eol_type;
4595       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4596         ccl->eol_type = CODING_EOL_LF;
4597       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4598       ccl->eight_bit_control = coding->dst_multibyte;
4599     }
4600   else
4601     ccl->eight_bit_control = 1;
4602   ccl->multibyte = coding->src_multibyte;
4603   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4604     {
4605       /* Move carryover bytes to DESTINATION.  */
4606       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4607       while (*p)
4608         *dst++ = *p++;
4609       coding->spec.ccl.eight_bit_carryover[0] = 0;
4610       if (dst_bytes)
4611         dst_bytes -= dst - destination;
4612     }
4613
4614   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4615                                   &(coding->consumed))
4616                       + dst - destination);
4617
4618   if (encodep)
4619     {
4620       coding->produced_char = coding->produced;
4621       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4622     }
4623   else if (!ccl->eight_bit_control)
4624     {
4625       /* The produced bytes forms a valid multibyte sequence. */
4626       coding->produced_char
4627         = multibyte_chars_in_text (destination, coding->produced);
4628       coding->spec.ccl.eight_bit_carryover[0] = 0;
4629     }
4630   else
4631     {
4632       /* On decoding, the destination should always multibyte.  But,
4633          CCL program might have been generated an invalid multibyte
4634          sequence.  Here we make such a sequence valid as
4635          multibyte.  */
4636       int bytes
4637         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4638
4639       if ((coding->consumed < src_bytes
4640            || !ccl->last_block)
4641           && coding->produced >= 1
4642           && destination[coding->produced - 1] >= 0x80)
4643         {
4644           /* We should not convert the tailing 8-bit codes to
4645              multibyte form even if they doesn't form a valid
4646              multibyte sequence.  They may form a valid sequence in
4647              the next call.  */
4648           int carryover = 0;
4649
4650           if (destination[coding->produced - 1] < 0xA0)
4651             carryover = 1;
4652           else if (coding->produced >= 2)
4653             {
4654               if (destination[coding->produced - 2] >= 0x80)
4655                 {
4656                   if (destination[coding->produced - 2] < 0xA0)
4657                     carryover = 2;
4658                   else if (coding->produced >= 3
4659                            && destination[coding->produced - 3] >= 0x80
4660                            && destination[coding->produced - 3] < 0xA0)
4661                     carryover = 3;
4662                 }
4663             }
4664           if (carryover > 0)
4665             {
4666               BCOPY_SHORT (destination + coding->produced - carryover,
4667                            coding->spec.ccl.eight_bit_carryover,
4668                            carryover);
4669               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4670               coding->produced -= carryover;
4671             }
4672         }
4673       coding->produced = str_as_multibyte (destination, bytes,
4674                                            coding->produced,
4675                                            &(coding->produced_char));
4676     }
4677
4678   switch (ccl->status)
4679     {
4680     case CCL_STAT_SUSPEND_BY_SRC:
4681       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4682       break;
4683     case CCL_STAT_SUSPEND_BY_DST:
4684       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4685       break;
4686     case CCL_STAT_QUIT:
4687     case CCL_STAT_INVALID_CMD:
4688       coding->result = CODING_FINISH_INTERRUPT;
4689       break;
4690     default:
4691       coding->result = CODING_FINISH_NORMAL;
4692       break;
4693     }
4694   return coding->result;
4695 }
4696
4697 /* Decode EOL format of the text at PTR of BYTES length destructively
4698    according to CODING->eol_type.  This is called after the CCL
4699    program produced a decoded text at PTR.  If we do CRLF->LF
4700    conversion, update CODING->produced and CODING->produced_char.  */
4701
4702 static void
4703 decode_eol_post_ccl (coding, ptr, bytes)
4704      struct coding_system *coding;
4705      unsigned char *ptr;
4706      int bytes;
4707 {
4708   Lisp_Object val, saved_coding_symbol;
4709   unsigned char *pend = ptr + bytes;
4710   int dummy;
4711
4712   /* Remember the current coding system symbol.  We set it back when
4713      an inconsistent EOL is found so that `last-coding-system-used' is
4714      set to the coding system that doesn't specify EOL conversion.  */
4715   saved_coding_symbol = coding->symbol;
4716
4717   coding->spec.ccl.cr_carryover = 0;
4718   if (coding->eol_type == CODING_EOL_UNDECIDED)
4719     {
4720       /* Here, to avoid the call of setup_coding_system, we directly
4721          call detect_eol_type.  */
4722       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4723       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4724         coding->eol_type = CODING_EOL_LF;
4725       if (coding->eol_type != CODING_EOL_UNDECIDED)
4726         {
4727           val = Fget (coding->symbol, Qeol_type);
4728           if (VECTORP (val) && XVECTOR (val)->size == 3)
4729             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4730         }
4731       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4732     }
4733
4734   if (coding->eol_type == CODING_EOL_LF
4735       || coding->eol_type == CODING_EOL_UNDECIDED)
4736     {
4737       /* We have nothing to do.  */
4738       ptr = pend;
4739     }
4740   else if (coding->eol_type == CODING_EOL_CRLF)
4741     {
4742       unsigned char *pstart = ptr, *p = ptr;
4743
4744       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4745           && *(pend - 1) == '\r')
4746         {
4747           /* If the last character is CR, we can't handle it here
4748              because LF will be in the not-yet-decoded source text.
4749              Record that the CR is not yet processed.  */
4750           coding->spec.ccl.cr_carryover = 1;
4751           coding->produced--;
4752           coding->produced_char--;
4753           pend--;
4754         }
4755       while (ptr < pend)
4756         {
4757           if (*ptr == '\r')
4758             {
4759               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4760                 {
4761                   *p++ = '\n';
4762                   ptr += 2;
4763                 }
4764               else
4765                 {
4766                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4767                     goto undo_eol_conversion;
4768                   *p++ = *ptr++;
4769                 }
4770             }
4771           else if (*ptr == '\n'
4772                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4773             goto undo_eol_conversion;
4774           else
4775             *p++ = *ptr++;
4776           continue;
4777
4778         undo_eol_conversion:
4779           /* We have faced with inconsistent EOL format at PTR.
4780              Convert all LFs before PTR back to CRLFs.  */
4781           for (p--, ptr--; p >= pstart; p--)
4782             {
4783               if (*p == '\n')
4784                 *ptr-- = '\n', *ptr-- = '\r';
4785               else
4786                 *ptr-- = *p;
4787             }
4788           /*  If carryover is recorded, cancel it because we don't
4789               convert CRLF anymore.  */
4790           if (coding->spec.ccl.cr_carryover)
4791             {
4792               coding->spec.ccl.cr_carryover = 0;
4793               coding->produced++;
4794               coding->produced_char++;
4795               pend++;
4796             }
4797           p = ptr = pend;
4798           coding->eol_type = CODING_EOL_LF;
4799           coding->symbol = saved_coding_symbol;
4800         }
4801       if (p < pend)
4802         {
4803           /* As each two-byte sequence CRLF was converted to LF, (PEND
4804              - P) is the number of deleted characters.  */
4805           coding->produced -= pend - p;
4806           coding->produced_char -= pend - p;
4807         }
4808     }
4809   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4810     {
4811       unsigned char *p = ptr;
4812
4813       for (; ptr < pend; ptr++)
4814         {
4815           if (*ptr == '\r')
4816             *ptr = '\n';
4817           else if (*ptr == '\n'
4818                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4819             {
4820               for (; p < ptr; p++)
4821                 {
4822                   if (*p == '\n')
4823                     *p = '\r';
4824                 }
4825               ptr = pend;
4826               coding->eol_type = CODING_EOL_LF;
4827               coding->symbol = saved_coding_symbol;
4828             }
4829         }
4830     }
4831 }
4832
4833 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4834    decoding, it may detect coding system and format of end-of-line if
4835    those are not yet decided.  The source should be unibyte, the
4836    result is multibyte if CODING->dst_multibyte is nonzero, else
4837    unibyte.  */
4838
4839 int
4840 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4841      struct coding_system *coding;
4842      const unsigned char *source;
4843      unsigned char *destination;
4844      int src_bytes, dst_bytes;
4845 {
4846   int extra = 0;
4847
4848   if (coding->type == coding_type_undecided)
4849     detect_coding (coding, source, src_bytes);
4850
4851   if (coding->eol_type == CODING_EOL_UNDECIDED
4852       && coding->type != coding_type_ccl)
4853     {
4854       detect_eol (coding, source, src_bytes);
4855       /* We had better recover the original eol format if we
4856          encounter an inconsistent eol format while decoding.  */
4857       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4858     }
4859
4860   coding->produced = coding->produced_char = 0;
4861   coding->consumed = coding->consumed_char = 0;
4862   coding->errors = 0;
4863   coding->result = CODING_FINISH_NORMAL;
4864
4865   switch (coding->type)
4866     {
4867     case coding_type_sjis:
4868       decode_coding_sjis_big5 (coding, source, destination,
4869                                src_bytes, dst_bytes, 1);
4870       break;
4871
4872     case coding_type_iso2022:
4873       decode_coding_iso2022 (coding, source, destination,
4874                              src_bytes, dst_bytes);
4875       break;
4876
4877     case coding_type_big5:
4878       decode_coding_sjis_big5 (coding, source, destination,
4879                                src_bytes, dst_bytes, 0);
4880       break;
4881
4882     case coding_type_emacs_mule:
4883       decode_coding_emacs_mule (coding, source, destination,
4884                                 src_bytes, dst_bytes);
4885       break;
4886
4887     case coding_type_ccl:
4888       if (coding->spec.ccl.cr_carryover)
4889         {
4890           /* Put the CR which was not processed by the previous call
4891              of decode_eol_post_ccl in DESTINATION.  It will be
4892              decoded together with the following LF by the call to
4893              decode_eol_post_ccl below.  */
4894           *destination = '\r';
4895           coding->produced++;
4896           coding->produced_char++;
4897           dst_bytes--;
4898           extra = coding->spec.ccl.cr_carryover;
4899         }
4900       ccl_coding_driver (coding, source, destination + extra,
4901                          src_bytes, dst_bytes, 0);
4902       if (coding->eol_type != CODING_EOL_LF)
4903         {
4904           coding->produced += extra;
4905           coding->produced_char += extra;
4906           decode_eol_post_ccl (coding, destination, coding->produced);
4907         }
4908       break;
4909
4910     default:
4911       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4912     }
4913
4914   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4915       && coding->mode & CODING_MODE_LAST_BLOCK
4916       && coding->consumed == src_bytes)
4917     coding->result = CODING_FINISH_NORMAL;
4918
4919   if (coding->mode & CODING_MODE_LAST_BLOCK
4920       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4921     {
4922       const unsigned char *src = source + coding->consumed;
4923       unsigned char *dst = destination + coding->produced;
4924
4925       src_bytes -= coding->consumed;
4926       coding->errors++;
4927       if (COMPOSING_P (coding))
4928         DECODE_COMPOSITION_END ('1');
4929       while (src_bytes--)
4930         {
4931           int c = *src++;
4932           dst += CHAR_STRING (c, dst);
4933           coding->produced_char++;
4934         }
4935       coding->consumed = coding->consumed_char = src - source;
4936       coding->produced = dst - destination;
4937       coding->result = CODING_FINISH_NORMAL;
4938     }
4939
4940   if (!coding->dst_multibyte)
4941     {
4942       coding->produced = str_as_unibyte (destination, coding->produced);
4943       coding->produced_char = coding->produced;
4944     }
4945
4946   return coding->result;
4947 }
4948
4949 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4950    multibyteness of the source is CODING->src_multibyte, the
4951    multibyteness of the result is always unibyte.  */
4952
4953 int
4954 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4955      struct coding_system *coding;
4956      const unsigned char *source;
4957      unsigned char *destination;
4958      int src_bytes, dst_bytes;
4959 {
4960   coding->produced = coding->produced_char = 0;
4961   coding->consumed = coding->consumed_char = 0;
4962   coding->errors = 0;
4963   coding->result = CODING_FINISH_NORMAL;
4964
4965   switch (coding->type)
4966     {
4967     case coding_type_sjis:
4968       encode_coding_sjis_big5 (coding, source, destination,
4969                                src_bytes, dst_bytes, 1);
4970       break;
4971
4972     case coding_type_iso2022:
4973       encode_coding_iso2022 (coding, source, destination,
4974                              src_bytes, dst_bytes);
4975       break;
4976
4977     case coding_type_big5:
4978       encode_coding_sjis_big5 (coding, source, destination,
4979                                src_bytes, dst_bytes, 0);
4980       break;
4981
4982     case coding_type_emacs_mule:
4983       encode_coding_emacs_mule (coding, source, destination,
4984                                 src_bytes, dst_bytes);
4985       break;
4986
4987     case coding_type_ccl:
4988       ccl_coding_driver (coding, source, destination,
4989                          src_bytes, dst_bytes, 1);
4990       break;
4991
4992     default:
4993       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4994     }
4995
4996   if (coding->mode & CODING_MODE_LAST_BLOCK
4997       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4998     {
4999       const unsigned char *src = source + coding->consumed;
5000       unsigned char *dst = destination + coding->produced;
5001
5002       if (coding->type == coding_type_iso2022)
5003         ENCODE_RESET_PLANE_AND_REGISTER;
5004       if (COMPOSING_P (coding))
5005         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5006       if (coding->consumed < src_bytes)
5007         {
5008           int len = src_bytes - coding->consumed;
5009
5010           BCOPY_SHORT (src, dst, len);
5011           if (coding->src_multibyte)
5012             len = str_as_unibyte (dst, len);
5013           dst += len;
5014           coding->consumed = src_bytes;
5015         }
5016       coding->produced = coding->produced_char = dst - destination;
5017       coding->result = CODING_FINISH_NORMAL;
5018     }
5019
5020   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5021       && coding->consumed == src_bytes)
5022     coding->result = CODING_FINISH_NORMAL;
5023
5024   return coding->result;
5025 }
5026
5027 /* Scan text in the region between *BEG and *END (byte positions),
5028    skip characters which we don't have to decode by coding system
5029    CODING at the head and tail, then set *BEG and *END to the region
5030    of the text we actually have to convert.  The caller should move
5031    the gap out of the region in advance if the region is from a
5032    buffer.
5033
5034    If STR is not NULL, *BEG and *END are indices into STR.  */
5035
5036 static void
5037 shrink_decoding_region (beg, end, coding, str)
5038      int *beg, *end;
5039      struct coding_system *coding;
5040      unsigned char *str;
5041 {
5042   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5043   int eol_conversion;
5044   Lisp_Object translation_table;
5045
5046   if (coding->type == coding_type_ccl
5047       || coding->type == coding_type_undecided
5048       || coding->eol_type != CODING_EOL_LF
5049       || !NILP (coding->post_read_conversion)
5050       || coding->composing != COMPOSITION_DISABLED)
5051     {
5052       /* We can't skip any data.  */
5053       return;
5054     }
5055   if (coding->type == coding_type_no_conversion
5056       || coding->type == coding_type_raw_text
5057       || coding->type == coding_type_emacs_mule)
5058     {
5059       /* We need no conversion, but don't have to skip any data here.
5060          Decoding routine handles them effectively anyway.  */
5061       return;
5062     }
5063
5064   translation_table = coding->translation_table_for_decode;
5065   if (NILP (translation_table) && !NILP (Venable_character_translation))
5066     translation_table = Vstandard_translation_table_for_decode;
5067   if (CHAR_TABLE_P (translation_table))
5068     {
5069       int i;
5070       for (i = 0; i < 128; i++)
5071         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5072           break;
5073       if (i < 128)
5074         /* Some ASCII character should be translated.  We give up
5075            shrinking.  */
5076         return;
5077     }
5078
5079   if (coding->heading_ascii >= 0)
5080     /* Detection routine has already found how much we can skip at the
5081        head.  */
5082     *beg += coding->heading_ascii;
5083
5084   if (str)
5085     {
5086       begp_orig = begp = str + *beg;
5087       endp_orig = endp = str + *end;
5088     }
5089   else
5090     {
5091       begp_orig = begp = BYTE_POS_ADDR (*beg);
5092       endp_orig = endp = begp + *end - *beg;
5093     }
5094
5095   eol_conversion = (coding->eol_type == CODING_EOL_CR
5096                     || coding->eol_type == CODING_EOL_CRLF);
5097
5098   switch (coding->type)
5099     {
5100     case coding_type_sjis:
5101     case coding_type_big5:
5102       /* We can skip all ASCII characters at the head.  */
5103       if (coding->heading_ascii < 0)
5104         {
5105           if (eol_conversion)
5106             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5107           else
5108             while (begp < endp && *begp < 0x80) begp++;
5109         }
5110       /* We can skip all ASCII characters at the tail except for the
5111          second byte of SJIS or BIG5 code.  */
5112       if (eol_conversion)
5113         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5114       else
5115         while (begp < endp && endp[-1] < 0x80) endp--;
5116       /* Do not consider LF as ascii if preceded by CR, since that
5117          confuses eol decoding. */
5118       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5119         endp++;
5120       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5121         endp++;
5122       break;
5123
5124     case coding_type_iso2022:
5125       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5126         /* We can't skip any data.  */
5127         break;
5128       if (coding->heading_ascii < 0)
5129         {
5130           /* We can skip all ASCII characters at the head except for a
5131              few control codes.  */
5132           while (begp < endp && (c = *begp) < 0x80
5133                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5134                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5135                  && (!eol_conversion || c != ISO_CODE_LF))
5136             begp++;
5137         }
5138       switch (coding->category_idx)
5139         {
5140         case CODING_CATEGORY_IDX_ISO_8_1:
5141         case CODING_CATEGORY_IDX_ISO_8_2:
5142           /* We can skip all ASCII characters at the tail.  */
5143           if (eol_conversion)
5144             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5145           else
5146             while (begp < endp && endp[-1] < 0x80) endp--;
5147           /* Do not consider LF as ascii if preceded by CR, since that
5148              confuses eol decoding. */
5149           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5150             endp++;
5151           break;
5152
5153         case CODING_CATEGORY_IDX_ISO_7:
5154         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5155           {
5156             /* We can skip all characters at the tail except for 8-bit
5157                codes and ESC and the following 2-byte at the tail.  */
5158             unsigned char *eight_bit = NULL;
5159
5160             if (eol_conversion)
5161               while (begp < endp
5162                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5163                 {
5164                   if (!eight_bit && c & 0x80) eight_bit = endp;
5165                   endp--;
5166                 }
5167             else
5168               while (begp < endp
5169                      && (c = endp[-1]) != ISO_CODE_ESC)
5170                 {
5171                   if (!eight_bit && c & 0x80) eight_bit = endp;
5172                   endp--;
5173                 }
5174             /* Do not consider LF as ascii if preceded by CR, since that
5175                confuses eol decoding. */
5176             if (begp < endp && endp < endp_orig
5177                 && endp[-1] == '\r' && endp[0] == '\n')
5178               endp++;
5179             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5180               {
5181                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5182                   /* This is an ASCII designation sequence.  We can
5183                      surely skip the tail.  But, if we have
5184                      encountered an 8-bit code, skip only the codes
5185                      after that.  */
5186                   endp = eight_bit ? eight_bit : endp + 2;
5187                 else
5188                   /* Hmmm, we can't skip the tail.  */
5189                   endp = endp_orig;
5190               }
5191             else if (eight_bit)
5192               endp = eight_bit;
5193           }
5194         }
5195       break;
5196
5197     default:
5198       abort ();
5199     }
5200   *beg += begp - begp_orig;
5201   *end += endp - endp_orig;
5202   return;
5203 }
5204
5205 /* Like shrink_decoding_region but for encoding.  Narrow the range
5206    *BEG..*END by skipping ASCII bytes at its head and tail.  If STR is
5207    non-NULL, *BEG and *END are offsets from STR; otherwise *BEG is
5208    handed to BYTE_POS_ADDR to locate the text.  Nothing is skipped for
5209    CCL coding systems, for eol types CR and CRLF, when CODING->cmp_data
5210    holds recorded data, for types that need no conversion, or when the
5211    translation table for encoding maps some ASCII character.  */
5212
5213 static void
5214 shrink_encoding_region (beg, end, coding, str)
5215      int *beg, *end;
5216      struct coding_system *coding;
5217      unsigned char *str;
5218 {
5219   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5220   Lisp_Object translation_table;
5221
5222   if (coding->type == coding_type_ccl
5223       || coding->eol_type == CODING_EOL_CRLF
5224       || coding->eol_type == CODING_EOL_CR
5225       || (coding->cmp_data && coding->cmp_data->used > 0))
5226     {
5227       /* We can't skip any data.  */
5228       return;
5229     }
5230   if (coding->type == coding_type_no_conversion
5231       || coding->type == coding_type_raw_text
5232       || coding->type == coding_type_emacs_mule
5233       || coding->type == coding_type_undecided)
5234     {
5235       /* We need no conversion, but don't have to skip any data here.
5236          Encoding routine handles them effectively anyway.  */
5237       return;
5238     }
5239
5240   translation_table = coding->translation_table_for_encode;
5241   if (NILP (translation_table) && !NILP (Venable_character_translation))
5242     translation_table = Vstandard_translation_table_for_encode;
5243   if (CHAR_TABLE_P (translation_table))
5244     {
5245       int i;
5246       for (i = 0; i < 128; i++)
5247         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5248           break;
5249       if (i < 128)
5250         /* Some ASCII character should be translated.  We give up
5251            shrinking.  */
5252         return;
5253     }
5254
5255   if (str)
5256     {
5257       begp_orig = begp = str + *beg;
5258       endp_orig = endp = str + *end;
5259     }
5260   else
5261     {
5262       begp_orig = begp = BYTE_POS_ADDR (*beg);
5263       endp_orig = endp = begp + *end - *beg;
5264     }
5265
5266   /* The CR and CRLF eol types made us return at the top, so no eol
5267      conversion is pending here and '\n' is skipped like any other
5268      ASCII byte.  */
5269   /* Here, we don't have to check coding->pre_write_conversion because
5270      the caller is expected to have handled it already.  */
5271   switch (coding->type)
5272     {
5273     case coding_type_iso2022:
5274       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5275         /* We can't skip any data.  */
5276         break;
5277       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5278         {
5279           /* Skip leading ASCII only as far as just after the last
5280              newline passed; skip nothing if no newline was passed.  */
5281           unsigned char *bol = begp;
5282           while (begp < endp && *begp < 0x80)
5283             {
5284               begp++;
5285               if (begp[-1] == '\n')
5286                 bol = begp;
5287             }
5288           begp = bol;
5289           goto label_skip_tail;
5290         }
5291       /* fall down ... */
5292
5293     case coding_type_sjis:
5294     case coding_type_big5:
5295       /* We can skip all ASCII characters at the head and tail.  */
5296       while (begp < endp && *begp < 0x80) begp++;
5297     label_skip_tail:
5298       while (begp < endp && endp[-1] < 0x80) endp--;
5299       break;
5300
5301     default:
5302       abort ();
5303     }
5304
5305   /* Report how far each end of the range moved.  */
5306   *beg += begp - begp_orig;
5307   *end += endp - endp_orig;
     }
5308
5309 /* Shrinking has some overhead of its own, so SHRINK_CONVERSION_REGION
5310    leaves alone any region not longer than this many bytes.  */
5311 static int shrink_conversion_region_threshhold = 1024;
5312
5313 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
5314   do {                                                                  \
5315     if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
5316       break;                                                            \
5317     if (encodep)                                                        \
5318       shrink_encoding_region (beg, end, coding, str);                   \
5319     else                                                                \
5320       shrink_decoding_region (beg, end, coding, str);                   \
5321   } while (0)
5322
     /* Unwind handler registered around calls to pre-write-conversion
        and post-read-conversion functions.  Put ARG back into
        Vlast_coding_system_used, clear inhibit_pre_post_conversion, and
        return Qnil.  */
5323 static Lisp_Object
5324 code_convert_region_unwind (arg)
5325      Lisp_Object arg;
5326 {
5327   Vlast_coding_system_used = arg;
5328   inhibit_pre_post_conversion = 0;
5329   return Qnil;
5330 }
5331
5332 /* Record every composition lying between FROM and TO of OBJ in the
5333    memory blocks that CODING->cmp_data points to.  OBJ is a buffer or
5334    a string, defaults to the current buffer.  */
5335
5336 void
5337 coding_save_composition (coding, from, to, obj)
5338      struct coding_system *coding;
5339      int from, to;
5340      Lisp_Object obj;
5341 {
5342   Lisp_Object prop;
5343   int start, end;
5344
5345   if (coding->composing == COMPOSITION_DISABLED)
5346     return;
5347   if (!coding->cmp_data)
5348     coding_allocate_composition_data (coding, from);
5349   if (!find_composition (from, to, &start, &end, &prop, obj) || end > to)
5350     return;
5351   if (start < from)
5352     {
5353       /* That one starts before FROM, so try the one after it.  */
5354       if (!find_composition (end, to, &start, &end, &prop, obj))
5355         return;
5356       if (end > to)
5357         return;
5358     }
5359   coding->composing = COMPOSITION_NO;
5360   do
5361     {
5362       if (COMPOSITION_VALID_P (start, end, prop))
5363         {
5364           enum composition_method method = COMPOSITION_METHOD (prop);
5365           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5366               >= COMPOSITION_DATA_SIZE)
5367             coding_allocate_composition_data (coding, from);
5368           /* Every composition gets its start and end recorded; all
5369              but relative ones get their components recorded too.  */
5370           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5371           if (method != COMPOSITION_RELATIVE)
5372             {
5373               Lisp_Object val = COMPOSITION_COMPONENTS (prop), ch;
5374               int i, len;
5375
5376               if (CONSP (val))
5377                 for (; CONSP (val); val = XCDR (val))
5378                   {
5379                     ch = XCAR (val);
5380                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5381                   }
5382               else if (VECTORP (val))
5383                 for (i = 0, len = XVECTOR (val)->size; i < len; i++)
5384                   {
5385                     ch = XVECTOR (val)->contents[i];
5386                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5387                   }
5388               else if (STRINGP (val))
5389                 for (i = 0, len = SCHARS (val); i < len; i++)
5390                   {
5391                     ch = Faref (val, make_number (i));
5392                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5393                   }
5394               else              /* INTEGERP (val) */
5395                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5396             }
5397           CODING_ADD_COMPOSITION_END (coding, end - from);
5398         }
5399       start = end;
5400     }
5401   while (start < to
5402          && find_composition (start, to, &start, &end, &prop, obj)
5403          && end <= to);
5404
5405   /* Make coding->cmp_data point to the first memory block.  */
5406   while (coding->cmp_data->prev)
5407     coding->cmp_data = coding->cmp_data->prev;
5408   coding->cmp_data_start = 0;
5409 }
5410
5411 /* Reflect the composition information saved in CODING->cmp_data to
5412    OBJ.  CODING->cmp_data may point to any block of the chain; all
5413    blocks are processed, starting from the first one.  OBJ is a buffer
5414    or a string, defaults to the current buffer.  */
5415
5416 void
5417 coding_restore_composition (coding, obj)
5418      struct coding_system *coding;
5419      Lisp_Object obj;
5420 {
5421   struct composition_data *block = coding->cmp_data;
5422   int pos;
5423
5424   if (!block)
5425     return;
5426
5427   /* Rewind to the first block of the chain.  */
5428   while (block->prev)
5429     block = block->prev;
5430
5431   for (; block; block = block->next)
5432     /* Each entry begins with its own length, which leads to the next
5433        entry; a length that is not positive ends this block.  */
5434     for (pos = 0; pos < block->used && block->data[pos] > 0;
5435          pos += block->data[pos])
5436       {
5437         int *entry = block->data + pos;
5438         enum composition_method method = (enum composition_method) entry[3];
5439         Lisp_Object components = Qnil;
5440
5441         if (method != COMPOSITION_RELATIVE)
5442           {
5443             Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5444             int len = entry[0] - 4, j;
5445
5446             /* With rules, an even element count is reduced by one.  */
5447             if (method == COMPOSITION_WITH_RULE_ALTCHARS && len % 2 == 0)
5448               len--;
5449             for (j = 0; j < len; j++)
5450               args[j] = make_number (entry[4 + j]);
5451             if (method == COMPOSITION_WITH_ALTCHARS)
5452               components = Fstring (len, args);
5453             else
5454               components = Fvector (len, args);
5455           }
5456         compose_text (entry[1], entry[2], components, Qnil, obj);
5457       }
5458 }
5459
5460 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5461    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5462    coding system CODING, and return the status code of code conversion
5463    (currently, this value has no meaning).
5464
5465    How many characters (and bytes) are converted to how many
5466    characters (and bytes) are recorded in members of the structure
5467    CODING.
5468
5469    If REPLACE is nonzero, we do various things as if the original text
5470    is deleted and a new text is inserted.  See the comments in
5471    replace_range (insdel.c) to know what we are doing.
5472
5473    If REPLACE is zero, it is assumed that the source text is unibyte.
5474    Otherwise, it is assumed that the source text is multibyte.  */
5475
5476 int
5477 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5478      int from, from_byte, to, to_byte, encodep, replace;
5479      struct coding_system *coding;
5480 {
5481   int len = to - from, len_byte = to_byte - from_byte;
5482   int nchars_del = 0, nbytes_del = 0;
5483   int require, inserted, inserted_byte;
5484   int head_skip, tail_skip, total_skip = 0;
5485   Lisp_Object saved_coding_symbol;
5486   int first = 1;
5487   unsigned char *src, *dst;
5488   Lisp_Object deletion;
5489   int orig_point = PT, orig_len = len;
5490   int prev_Z;
5491   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5492
5493   deletion = Qnil;
5494   saved_coding_symbol = coding->symbol;
5495
5496   if (from < PT && PT < to)
5497     {
5498       TEMP_SET_PT_BOTH (from, from_byte);
5499       orig_point = from;
5500     }
5501
5502   if (replace)
5503     {
5504       int saved_from = from;
5505       int saved_inhibit_modification_hooks;
5506
5507       prepare_to_modify_buffer (from, to, &from);
5508       if (saved_from != from)
5509         {
5510           to = from + len;
5511           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5512           len_byte = to_byte - from_byte;
5513         }
5514
5515       /* The code conversion routine can not preserve text properties
5516          for now.  So, we must remove all text properties in the
5517          region.  Here, we must suppress all modification hooks.  */
5518       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5519       inhibit_modification_hooks = 1;
5520       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5521       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5522     }
5523
5524   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5525     {
5526       /* We must detect encoding of text and eol format.  */
5527
5528       if (from < GPT && to > GPT)
5529         move_gap_both (from, from_byte);
5530       if (coding->type == coding_type_undecided)
5531         {
5532           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5533           if (coding->type == coding_type_undecided)
5534             {
5535               /* It seems that the text contains only ASCII, but we
5536                  should not leave it undecided because the deeper
5537                  decoding routine (decode_coding) tries to detect the
5538                  encodings again in vain.  */
5539               coding->type = coding_type_emacs_mule;
5540               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5541               /* As emacs-mule decoder will handle composition, we
5542                  need this setting to allocate coding->cmp_data
5543                  later.  */
5544               coding->composing = COMPOSITION_NO;
5545             }
5546         }
5547       if (coding->eol_type == CODING_EOL_UNDECIDED
5548           && coding->type != coding_type_ccl)
5549         {
5550           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5551           if (coding->eol_type == CODING_EOL_UNDECIDED)
5552             coding->eol_type = CODING_EOL_LF;
5553           /* We had better recover the original eol format if we
5554              encounter an inconsistent eol format while decoding.  */
5555           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5556         }
5557     }
5558
5559   /* Now we convert the text.  */
5560
5561   /* For encoding, we must process pre-write-conversion in advance.  */
5562   if (! inhibit_pre_post_conversion
5563       && encodep
5564       && SYMBOLP (coding->pre_write_conversion)
5565       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5566     {
5567       /* The function in pre-write-conversion may put a new text in a
5568          new buffer.  */
5569       struct buffer *prev = current_buffer;
5570       Lisp_Object new;
5571
5572       record_unwind_protect (code_convert_region_unwind,
5573                              Vlast_coding_system_used);
5574       /* We should not call any more pre-write/post-read-conversion
5575          functions while this pre-write-conversion is running.  */
5576       inhibit_pre_post_conversion = 1;
5577       call2 (coding->pre_write_conversion,
5578              make_number (from), make_number (to));
5579       inhibit_pre_post_conversion = 0;
5580       /* Discard the unwind protect.  */
5581       specpdl_ptr--;
5582
5583       if (current_buffer != prev)
5584         {
5585           len = ZV - BEGV;
5586           new = Fcurrent_buffer ();
5587           set_buffer_internal_1 (prev);
5588           del_range_2 (from, from_byte, to, to_byte, 0);
5589           TEMP_SET_PT_BOTH (from, from_byte);
5590           insert_from_buffer (XBUFFER (new), 1, len, 0);
5591           Fkill_buffer (new);
5592           if (orig_point >= to)
5593             orig_point += len - orig_len;
5594           else if (orig_point > from)
5595             orig_point = from;
5596           orig_len = len;
5597           to = from + len;
5598           from_byte = CHAR_TO_BYTE (from);
5599           to_byte = CHAR_TO_BYTE (to);
5600           len_byte = to_byte - from_byte;
5601           TEMP_SET_PT_BOTH (from, from_byte);
5602         }
5603     }
5604
5605   if (replace)
5606     {
5607       if (! EQ (current_buffer->undo_list, Qt))
5608         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5609       else
5610         {
5611           nchars_del = to - from;
5612           nbytes_del = to_byte - from_byte;
5613         }
5614     }
5615
5616   if (coding->composing != COMPOSITION_DISABLED)
5617     {
5618       if (encodep)
5619         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5620       else
5621         coding_allocate_composition_data (coding, from);
5622     }
5623
5624   /* Try to skip the heading and tailing ASCIIs.  */
5625   if (coding->type != coding_type_ccl)
5626     {
5627       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5628
5629       if (from < GPT && GPT < to)
5630         move_gap_both (from, from_byte);
5631       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5632       if (from_byte == to_byte
5633           && (encodep || NILP (coding->post_read_conversion))
5634           && ! CODING_REQUIRE_FLUSHING (coding))
5635         {
5636           coding->produced = len_byte;
5637           coding->produced_char = len;
5638           if (!replace)
5639             /* We must record and adjust for this new text now.  */
5640             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
               /* Composition data may have been set up just above;
                  release it here as the normal exit path below does.  */
               coding_free_composition_data (coding);
5641           return 0;
5642         }
5643
5644       head_skip = from_byte - from_byte_orig;
5645       tail_skip = to_byte_orig - to_byte;
5646       total_skip = head_skip + tail_skip;
5647       from += head_skip;
5648       to -= tail_skip;
5649       len -= total_skip; len_byte -= total_skip;
5650     }
5651
5652   /* For conversion, we must put the gap before the text in addition to
5653      making the gap larger for efficient decoding.  The required gap
5654      size starts from 2000 which is the magic number used in make_gap.
5655      But, after one batch of conversion, it will be incremented if we
5656      find that it is not enough .  */
5657   require = 2000;
5658
5659   if (GAP_SIZE  < require)
5660     make_gap (require - GAP_SIZE);
5661   move_gap_both (from, from_byte);
5662
5663   inserted = inserted_byte = 0;
5664
5665   GAP_SIZE += len_byte;
5666   ZV -= len;
5667   Z -= len;
5668   ZV_BYTE -= len_byte;
5669   Z_BYTE -= len_byte;
5670
5671   if (GPT - BEG < BEG_UNCHANGED)
5672     BEG_UNCHANGED = GPT - BEG;
5673   if (Z - GPT < END_UNCHANGED)
5674     END_UNCHANGED = Z - GPT;
5675
5676   if (!encodep && coding->src_multibyte)
5677     {
5678       /* Decoding routines expects that the source text is unibyte.
5679          We must convert 8-bit characters of multibyte form to
5680          unibyte.  */
5681       int len_byte_orig = len_byte;
5682       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5683       if (len_byte < len_byte_orig)
5684         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5685                     len_byte);
5686       coding->src_multibyte = 0;
5687     }
5688
5689   for (;;)
5690     {
5691       int result;
5692
5693       /* The buffer memory is now:
5694          +--------+converted-text+---------+-------original-text-------+---+
5695          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5696                   |<---------------------- GAP ----------------------->|  */
5697       src = GAP_END_ADDR - len_byte;
5698       dst = GPT_ADDR + inserted_byte;
5699
5700       if (encodep)
5701         result = encode_coding (coding, src, dst, len_byte, 0);
5702       else
5703         {
5704           if (coding->composing != COMPOSITION_DISABLED)
5705             coding->cmp_data->char_offset = from + inserted;
5706           result = decode_coding (coding, src, dst, len_byte, 0);
5707         }
5708
5709       /* The buffer memory is now:
5710          +--------+-------converted-text----+--+------original-text----+---+
5711          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5712                   |<---------------------- GAP ----------------------->|  */
5713
5714       inserted += coding->produced_char;
5715       inserted_byte += coding->produced;
5716       len_byte -= coding->consumed;
5717
5718       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5719         {
5720           coding_allocate_composition_data (coding, from + inserted);
5721           continue;
5722         }
5723
5724       src += coding->consumed;
5725       dst += coding->produced;
5726
5727       if (result == CODING_FINISH_NORMAL)
5728         {
5729           src += len_byte;
5730           break;
5731         }
5732       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5733         {
5734           unsigned char *pend = dst, *p = pend - inserted_byte;
5735           Lisp_Object eol_type;
5736
5737           /* Encode LFs back to the original eol format (CR or CRLF).  */
5738           if (coding->eol_type == CODING_EOL_CR)
5739             {
5740               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5741             }
5742           else
5743             {
5744               int count = 0;
5745
5746               while (p < pend) if (*p++ == '\n') count++;
5747               if (src - dst < count)
5748                 {
5749                   /* We don't have sufficient room for encoding LFs
5750                      back to CRLF.  We must record converted and
5751                      not-yet-converted text back to the buffer
5752                      content, enlarge the gap, then record them out of
5753                      the buffer contents again.  */
5754                   int add = len_byte + inserted_byte;
5755
5756                   GAP_SIZE -= add;
5757                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5758                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5759                   make_gap (count - GAP_SIZE);
5760                   GAP_SIZE += add;
5761                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5762                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5763                   /* Don't forget to update SRC, DST, and PEND.  */
5764                   src = GAP_END_ADDR - len_byte;
5765                   dst = GPT_ADDR + inserted_byte;
5766                   pend = dst;
5767                 }
5768               inserted += count;
5769               inserted_byte += count;
5770               coding->produced += count;
5771               p = dst = pend + count;
5772               while (count)
5773                 {
5774                   *--p = *--pend;
5775                   if (*p == '\n') count--, *--p = '\r';
5776                 }
5777             }
5778
5779           /* Suppress eol-format conversion in the further conversion.  */
5780           coding->eol_type = CODING_EOL_LF;
5781
5782           /* Set the coding system symbol to that for Unix-like EOL.  */
5783           eol_type = Fget (saved_coding_symbol, Qeol_type);
5784           if (VECTORP (eol_type)
5785               && XVECTOR (eol_type)->size == 3
5786               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5787             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5788           else
5789             coding->symbol = saved_coding_symbol;
5790
5791           continue;
5792         }
5793       if (len_byte <= 0)
5794         {
5795           if (coding->type != coding_type_ccl
5796               || coding->mode & CODING_MODE_LAST_BLOCK)
5797             break;
5798           coding->mode |= CODING_MODE_LAST_BLOCK;
5799           continue;
5800         }
5801       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5802         {
5803           /* The source text ends in invalid codes.  Let's just
5804              make them valid buffer contents, and finish conversion.  */
5805           if (multibyte_p)
5806             {
5807               unsigned char *start = dst;
5808
5809               inserted += len_byte;
5810               while (len_byte--)
5811                 {
5812                   int c = *src++;
5813                   dst += CHAR_STRING (c, dst);
5814                 }
5815
5816               inserted_byte += dst - start;
5817             }
5818           else
5819             {
5820               inserted += len_byte;
5821               inserted_byte += len_byte;
5822               while (len_byte--)
5823                 *dst++ = *src++;
5824             }
5825           break;
5826         }
5827       if (result == CODING_FINISH_INTERRUPT)
5828         {
5829           /* The conversion procedure was interrupted by a user.  */
5830           break;
5831         }
5832       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5833       if (coding->consumed < 1)
5834         {
5835           /* It's quite strange to require more memory without
5836              consuming any bytes.  Perhaps CCL program bug.  */
5837           break;
5838         }
5839       if (first)
5840         {
5841           /* We have just done the first batch of conversion which was
5842              stopped because of insufficient gap.  Let's reconsider the
5843              required gap size (i.e. SRC - DST) now.
5844
5845              We have converted ORIG bytes (== coding->consumed) into
5846              NEW bytes (coding->produced).  To convert the remaining
5847              LEN bytes, we may need REQUIRE bytes of gap, where:
5848                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5849                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5850              Here, we are sure that NEW >= ORIG.  */
5851           double ratio;
5852
5853           if (coding->produced <= coding->consumed)
5854             {
5855               /* This happens because of CCL-based coding system with
5856                  eol-type CRLF.  */
5857               require = 0;
5858             }
5859           else
5860             {
                   /* Divide in floating point; an integer division would
                      truncate the ratio (to 0 whenever NEW < 2 * ORIG)
                      and so underestimate the gap needed.  */
5861               ratio = coding->produced - coding->consumed;
5862               require = len_byte * (ratio / coding->consumed);
5863             }
5864           first = 0;
5865         }
5866       if ((src - dst) < (require + 2000))
5867         {
5868           /* See the comment above the previous call of make_gap.  */
5869           int add = len_byte + inserted_byte;
5870
5871           GAP_SIZE -= add;
5872           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5873           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5874           make_gap (require + 2000);
5875           GAP_SIZE += add;
5876           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5877           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5878         }
5879     }
5880   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5881
5882   if (encodep && coding->dst_multibyte)
5883     {
5884       /* The output is unibyte.  We must convert 8-bit characters to
5885          multibyte form.  */
5886       if (inserted_byte * 2 > GAP_SIZE)
5887         {
5888           GAP_SIZE -= inserted_byte;
5889           ZV += inserted_byte; Z += inserted_byte;
5890           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5891           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5892           make_gap (inserted_byte - GAP_SIZE);
5893           GAP_SIZE += inserted_byte;
5894           ZV -= inserted_byte; Z -= inserted_byte;
5895           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5896           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5897         }
5898       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5899     }
5900
5901   /* If we shrank the conversion area, adjust it now.  */
5902   if (total_skip > 0)
5903     {
5904       if (tail_skip > 0)
5905         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5906       inserted += total_skip; inserted_byte += total_skip;
5907       GAP_SIZE += total_skip;
5908       GPT -= head_skip; GPT_BYTE -= head_skip;
5909       ZV -= total_skip; ZV_BYTE -= total_skip;
5910       Z -= total_skip; Z_BYTE -= total_skip;
5911       from -= head_skip; from_byte -= head_skip;
5912       to += tail_skip; to_byte += tail_skip;
5913     }
5914
5915   prev_Z = Z;
5916   if (! EQ (current_buffer->undo_list, Qt))
5917     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5918   else
5919     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5920                                  inserted, inserted_byte);
5921   inserted = Z - prev_Z;
5922
5923   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5924     coding_restore_composition (coding, Fcurrent_buffer ());
5925   coding_free_composition_data (coding);
5926
5927   if (! inhibit_pre_post_conversion
5928       && ! encodep && ! NILP (coding->post_read_conversion))
5929     {
5930       Lisp_Object val;
5931       Lisp_Object saved_coding_system;
5932
5933       if (from != PT)
5934         TEMP_SET_PT_BOTH (from, from_byte);
5935       prev_Z = Z;
5936       record_unwind_protect (code_convert_region_unwind,
5937                              Vlast_coding_system_used);
5938       saved_coding_system = Vlast_coding_system_used;
5939       Vlast_coding_system_used = coding->symbol;
5940       /* We should not call any more pre-write/post-read-conversion
5941          functions while this post-read-conversion is running.  */
5942       inhibit_pre_post_conversion = 1;
5943       val = call1 (coding->post_read_conversion, make_number (inserted));
5944       inhibit_pre_post_conversion = 0;
5945       coding->symbol = Vlast_coding_system_used;
5946       Vlast_coding_system_used = saved_coding_system;
5947       /* Discard the unwind protect.  */
5948       specpdl_ptr--;
5949       CHECK_NUMBER (val);
5950       inserted += Z - prev_Z;
5951     }
5952
5953   if (orig_point >= from)
5954     {
5955       if (orig_point >= from + orig_len)
5956         orig_point += inserted - orig_len;
5957       else
5958         orig_point = from;
5959       TEMP_SET_PT (orig_point);
5960     }
5961
5962   if (replace)
5963     {
5964       signal_after_change (from, to - from, inserted);
5965       update_compositions (from, from + inserted, CHECK_BORDER);
5966     }
5967
5968   {
5969     coding->consumed = to_byte - from_byte;
5970     coding->consumed_char = to - from;
5971     coding->produced = inserted_byte;
5972     coding->produced_char = inserted;
5973   }
5974
5975   return 0;
5976 }
5977
     /* Run the pre-write-conversion function (if ENCODEP is nonzero) or
        the post-read-conversion function (otherwise) of CODING on the
        text of STR, using the work buffer " *code-converting-work*", and
        return the work buffer's resulting text as made by
        make_buffer_string.  inhibit_pre_post_conversion is set to 1
        while that function runs.  Unwind-protect entries are recorded
        for the current buffer and for Vlast_coding_system_used;
        presumably unbind_to runs them on exit -- confirm.  */
5978 Lisp_Object
5979 run_pre_post_conversion_on_str (str, coding, encodep)
5980      Lisp_Object str;
5981      struct coding_system *coding;
5982      int encodep;
5983 {
5984   int count = SPECPDL_INDEX ();
5985   struct gcpro gcpro1, gcpro2;
5986   int multibyte = STRING_MULTIBYTE (str);
5987   Lisp_Object buffer;
5988   struct buffer *buf;
5989   Lisp_Object old_deactivate_mark;
5990
5991   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5992   record_unwind_protect (code_convert_region_unwind,
5993                          Vlast_coding_system_used);
5994   /* It is not crucial to specbind this.  */
5995   old_deactivate_mark = Vdeactivate_mark;
5996   GCPRO2 (str, old_deactivate_mark);
5997
5998   buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5999   buf = XBUFFER (buffer);
6000
       /* Reset the state of the work buffer: same directory as the
          current buffer, not read-only, no file name, undo list set to
          t, and no overlays.  */
6001   buf->directory = current_buffer->directory;
6002   buf->read_only = Qnil;
6003   buf->filename = Qnil;
6004   buf->undo_list = Qt;
6005   buf->overlays_before = NULL;
6006   buf->overlays_after = NULL;
6007
6008   set_buffer_internal (buf);
6009   /* We must insert the contents of STR as is without
6010      unibyte<->multibyte conversion.  For that, we adjust the
6011      multibyteness of the working buffer to that of STR.  */
6012   Ferase_buffer ();
6013   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6014
6015   insert_from_string (str, 0, 0,
6016                       SCHARS (str), SBYTES (str), 0);
6017   UNGCPRO;
       /* No nested pre-write/post-read-conversion while the function
          below is running.  */
6018   inhibit_pre_post_conversion = 1;
6019   if (encodep)
         /* The function gets the bounds of the whole work buffer.  */
6020     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6021   else
6022     {
           /* The function is called with point at BEG and with the
              number of characters in the buffer.  CODING's symbol is
              passed in Vlast_coding_system_used, and whatever value
              that variable holds afterwards is stored back into
              CODING->symbol.  */
6023       Vlast_coding_system_used = coding->symbol;
6024       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6025       call1 (coding->post_read_conversion, make_number (Z - BEG));
6026       coding->symbol = Vlast_coding_system_used;
6027     }
6028   inhibit_pre_post_conversion = 0;
6029   Vdeactivate_mark = old_deactivate_mark;
6030   str = make_buffer_string (BEG, Z, 1);
6031   return unbind_to (count, str);
6032 }
6033
/* Decode the text of string STR according to CODING and return the
   decoded text as a string.

   STR is the source text; if it is multibyte it is first
   reinterpreted as unibyte with Fstring_as_unibyte.  If CODING still
   requires detection, the text type and the end-of-line format are
   detected from the bytes of STR and stored into CODING.

   NOCOPY only matters on the path where nothing has to be decoded:
   the string is then returned as is when NOCOPY is nonzero (or when
   STR was already replaced by a converted string), and as a copy made
   by Fcopy_sequence otherwise.

   CODING->consumed, consumed_char, produced and produced_char are
   set before returning: to the sizes of STR on the no-decoding path,
   otherwise to the totals accumulated over the decode_coding calls.

   If CODING->post_read_conversion is a symbol with a function
   definition, the decoded string is finally passed through
   run_pre_post_conversion_on_str.  */

Lisp_Object
decode_coding_string (str, coding, nocopy)
     Lisp_Object str;
     struct coding_system *coding;
     int nocopy;
{
  int len;
  struct conversion_buffer buf;
  int from, to_byte;
  Lisp_Object saved_coding_symbol;
  int result;
  int require_decoding;
  int shrinked_bytes = 0;
  Lisp_Object newstr;
  int consumed, consumed_char, produced, produced_char;

  /* FROM and TO_BYTE delimit, as byte offsets into STR, the part that
     is handed to the decoder.  */
  from = 0;
  to_byte = SBYTES (str);

  saved_coding_symbol = coding->symbol;
  coding->src_multibyte = STRING_MULTIBYTE (str);
  coding->dst_multibyte = 1;
  if (CODING_REQUIRE_DETECTION (coding))
    {
      /* See the comments in code_convert_region.  */
      if (coding->type == coding_type_undecided)
        {
          detect_coding (coding, SDATA (str), to_byte);
          /* Detection found nothing: fall back to emacs-mule.  */
          if (coding->type == coding_type_undecided)
            {
              coding->type = coding_type_emacs_mule;
              coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
              /* As emacs-mule decoder will handle composition, we
                 need this setting to allocate coding->cmp_data
                 later.  */
              coding->composing = COMPOSITION_NO;
            }
        }
      if (coding->eol_type == CODING_EOL_UNDECIDED
          && coding->type != coding_type_ccl)
        {
          /* Remember the symbol before eol detection, so that it can
             be restored if the eol format turns out inconsistent.  */
          saved_coding_symbol = coding->symbol;
          detect_eol (coding, SDATA (str), to_byte);
          /* No eol format detected: default to LF.  */
          if (coding->eol_type == CODING_EOL_UNDECIDED)
            coding->eol_type = CODING_EOL_LF;
          /* We had better recover the original eol format if we
             encounter an inconsistent eol format while decoding.  */
          coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
        }
    }

  /* no-conversion and raw-text produce a unibyte result.  */
  if (coding->type == coding_type_no_conversion
      || coding->type == coding_type_raw_text)
    coding->dst_multibyte = 0;

  require_decoding = CODING_REQUIRE_DECODING (coding);

  if (STRING_MULTIBYTE (str))
    {
      /* Decoding routines expect the source text to be unibyte.  */
      str = Fstring_as_unibyte (str);
      to_byte = SBYTES (str);
      /* STR is now a converted string, so it may be returned without
         making another copy.  */
      nocopy = 1;
      coding->src_multibyte = 0;
    }

  /* Try to skip the leading and trailing ASCII text.  */
  if (require_decoding && coding->type != coding_type_ccl)
    {
      SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
                                0);
      /* The whole string was skipped; nothing is left to decode.  */
      if (from == to_byte)
        require_decoding = 0;
      /* Number of bytes skipped at the head plus at the tail.  */
      shrinked_bytes = from + (SBYTES (str) - to_byte);
    }

  /* Shortcut: nothing to decode and no post-read-conversion function
     to run.  */
  if (!require_decoding
      && !(SYMBOLP (coding->post_read_conversion)
           && !NILP (Ffboundp (coding->post_read_conversion))))
    {
      coding->consumed = SBYTES (str);
      coding->consumed_char = SCHARS (str);
      if (coding->dst_multibyte)
        {
          str = Fstring_as_multibyte (str);
          nocopy = 1;
        }
      coding->produced = SBYTES (str);
      coding->produced_char = SCHARS (str);
      return (nocopy ? str : Fcopy_sequence (str));
    }

  if (coding->composing != COMPOSITION_DISABLED)
    coding_allocate_composition_data (coding, from);
  len = decoding_buffer_size (coding, to_byte - from);
  allocate_conversion_buffer (buf, len);

  /* Decode repeatedly; each round continues where the previous one
     stopped, after dealing with whatever made it stop.  */
  consumed = consumed_char = produced = produced_char = 0;
  while (1)
    {
      result = decode_coding (coding, SDATA (str) + from + consumed,
                              buf.data + produced, to_byte - from - consumed,
                              buf.size - produced);
      /* decode_coding reports per-call counts in CODING; accumulate
         them into the running totals.  */
      consumed += coding->consumed;
      consumed_char += coding->consumed_char;
      produced += coding->produced;
      produced_char += coding->produced_char;
      /* Stop on normal completion, or when the source is insufficient
         and the last call consumed nothing.  */
      if (result == CODING_FINISH_NORMAL
          || (result == CODING_FINISH_INSUFFICIENT_SRC
              && coding->consumed == 0))
        break;
      if (result == CODING_FINISH_INSUFFICIENT_CMP)
        coding_allocate_composition_data (coding, from + produced_char);
      else if (result == CODING_FINISH_INSUFFICIENT_DST)
        extend_conversion_buffer (&buf);
      else if (result == CODING_FINISH_INCONSISTENT_EOL)
        {
          Lisp_Object eol_type;

          /* Recover the original EOL format in the text decoded so
             far.  */
          if (coding->eol_type == CODING_EOL_CR)
            {
              /* Turn every LF back into CR, in place.  */
              unsigned char *p;
              for (p = buf.data; p < buf.data + produced; p++)
                if (*p == '\n') *p = '\r';
            }
          else if (coding->eol_type == CODING_EOL_CRLF)
            {
              /* Count the LFs, then move the text towards the end of
                 the buffer, working backwards and putting a CR in
                 front of each LF.  */
              int num_eol = 0;
              unsigned char *p0, *p1;
              for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
                if (*p0 == '\n') num_eol++;
              if (produced + num_eol >= buf.size)
                extend_conversion_buffer (&buf);
              for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
                {
                  *--p1 = *--p0;
                  if (*p0 == '\n') *--p1 = '\r';
                }
              produced += num_eol;
              produced_char += num_eol;
            }
          /* Suppress eol-format conversion in the further conversion.  */
          coding->eol_type = CODING_EOL_LF;

          /* Set the coding system symbol to that for Unix-like EOL.  */
          eol_type = Fget (saved_coding_symbol, Qeol_type);
          if (VECTORP (eol_type)
              && XVECTOR (eol_type)->size == 3
              && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
            coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
          else
            coding->symbol = saved_coding_symbol;


        }
    }

  coding->consumed = consumed;
  coding->consumed_char = consumed_char;
  coding->produced = produced;
  coding->produced_char = produced_char;

  /* Build the result: skipped head + decoded text + skipped tail.
     Each skipped byte is counted as one character.  */
  if (coding->dst_multibyte)
    newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
                                           produced + shrinked_bytes);
  else
    newstr = make_uninit_string (produced + shrinked_bytes);
  if (from > 0)
    STRING_COPYIN (newstr, 0, SDATA (str), from);
  STRING_COPYIN (newstr, from, buf.data, produced);
  /* SHRINKED_BYTES - FROM is the number of bytes skipped at the
     tail.  */
  if (shrinked_bytes > from)
    STRING_COPYIN (newstr, from + produced,
                   SDATA (str) + to_byte,
                   shrinked_bytes - from);
  free_conversion_buffer (&buf);

  if (coding->cmp_data && coding->cmp_data->used)
    coding_restore_composition (coding, newstr);
  coding_free_composition_data (coding);

  /* Let the coding system's post-read-conversion function, if any,
     process the decoded text.  */
  if (SYMBOLP (coding->post_read_conversion)
      && !NILP (Ffboundp (coding->post_read_conversion)))
    newstr = run_pre_post_conversion_on_str (newstr, coding, 0);

  return newstr;
}
6221
6222 Lisp_Object
6223 encode_coding_string (str, coding, nocopy)
6224      Lisp_Object str;
6225      struct coding_system *coding;
6226      int nocopy;
6227 {
6228   int len;
6229   struct conversion_buffer buf;
6230   int from, to, to_byte;
6231   int result;
6232   int shrinked_bytes = 0;
6233   Lisp_Object newstr;
6234   int consumed, consumed_char, produced, produced_char;
6235
6236   if (SYMBOLP (coding->pre_write_conversion)
6237       && !NILP (Ffboundp (coding->pre_write_conversion)))
6238     str = run_pre_post_conversion_on_str (str, coding, 1);
6239
6240   from = 0;
6241   to = SCHARS (str);
6242   to_byte = SBYTES (str);
6243
6244   /* Encoding routines determine the multibyteness of the source text
6245      by coding->src_multibyte.  */
6246   coding->src_multibyte = STRING_MULTIBYTE (str);
6247   coding->dst_multibyte = 0;
6248   if (! CODING_REQUIRE_ENCODING (coding))
6249     {
6250       coding->consumed = SBYTES (str);
6251       coding->consumed_char = SCHARS (str);
6252       if (STRING_MULTIBYTE (str))
6253         {
6254           str = Fstring_as_unibyte (str);
6255           nocopy = 1;
6256         }
6257       coding->produced = SBYTES (str);
6258       coding->produced_char = SCHARS (str);
6259       return (nocopy ? str : Fcopy_sequence (str));
6260     }
6261
6262   if (coding->composing != COMPOSITION_DISABLED)
6263     coding_save_composition (coding, from, to, str);
6264
6265   /* Try to skip the heading and tailing ASCIIs.  */
6266   if (coding->type != coding_type_ccl)
6267     {
6268       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6269                                 1);
6270       if (from == to_byte)
6271         return (nocopy ? str : Fcopy_sequence (str));
6272       shrinked_bytes = from + (SBYTES (str) - to_byte);
6273     }
6274
6275   len = encoding_buffer_size (coding, to_byte - from);
6276   allocate_conversion_buffer (buf, len);
6277
6278   consumed = consumed_char = produced = produced_char = 0;
6279   while (1)
6280     {
6281       result = encode_coding (coding, SDATA (str) + from + consumed,
6282                               buf.data + produced, to_byte - from - consumed,
6283                               buf.size - produced);
6284       consumed += coding->consumed;
6285       consumed_char += coding->consumed_char;
6286       produced += coding->produced;
6287       produced_char += coding->produced_char;
6288       if (result == CODING_FINISH_NORMAL
6289           || (result == CODING_FINISH_INSUFFICIENT_SRC
6290               && coding->consumed == 0))
6291         break;
6292       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6293       extend_conversion_buffer (&buf);
6294     }
6295
6296   coding->consumed = consumed;
6297   coding->consumed_char = consumed_char;
6298   coding->produced = produced;
6299   coding->produced_char = produced_char;
6300
6301   newstr = make_uninit_string (produced + shrinked_bytes);
6302   if (from > 0)
6303     STRING_COPYIN (newstr, 0, SDATA (str), from);
6304   STRING_COPYIN (newstr, from, buf.data, produced);
6305   if (shrinked_bytes > from)
6306     STRING_COPYIN (newstr, from + produced,
6307                    SDATA (str) + to_byte,
6308                    shrinked_bytes - from);
6309
6310   free_conversion_buffer (&buf);
6311   coding_free_composition_data (coding);
6312
6313   return newstr;
6314 }
6315
6316 \f
6317 #ifdef emacs
6318 /*** 8. Emacs Lisp library functions ***/
6319
6320 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6321        doc: /* Return t if OBJECT is nil or a coding-system.
6322 See the documentation of `make-coding-system' for information
6323 about coding-system objects.  */)
6324      (obj)
6325      Lisp_Object obj;
6326 {
6327   if (NILP (obj))
6328     return Qt;
6329   if (!SYMBOLP (obj))
6330     return Qnil;
6331   /* Get coding-spec vector for OBJ.  */
6332   obj = Fget (obj, Qcoding_system);
6333   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6334           ? Qt : Qnil);
6335 }
6336
6337 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6338        Sread_non_nil_coding_system, 1, 1, 0,
6339        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6340      (prompt)
6341      Lisp_Object prompt;
6342 {
6343   Lisp_Object val;
6344   do
6345     {
6346       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6347                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6348     }
6349   while (SCHARS (val) == 0);
6350   return (Fintern (val, Qnil));
6351 }
6352
6353 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6354        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6355 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6356      (prompt, default_coding_system)
6357      Lisp_Object prompt, default_coding_system;
6358 {
6359   Lisp_Object val;
6360   if (SYMBOLP (default_coding_system))
6361     default_coding_system = SYMBOL_NAME (default_coding_system);
6362   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6363                           Qt, Qnil, Qcoding_system_history,
6364                           default_coding_system, Qnil);
6365   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6366 }
6367
6368 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6369        1, 1, 0,
6370        doc: /* Check validity of CODING-SYSTEM.
6371 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6372 It is valid if it is a symbol with a non-nil `coding-system' property.
6373 The value of property should be a vector of length 5.  */)
6374      (coding_system)
6375      Lisp_Object coding_system;
6376 {
6377   CHECK_SYMBOL (coding_system);
6378   if (!NILP (Fcoding_system_p (coding_system)))
6379     return coding_system;
6380   while (1)
6381     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6382 }
6383 \f
6384 Lisp_Object
6385 detect_coding_system (src, src_bytes, highest, multibytep)
6386      const unsigned char *src;
6387      int src_bytes, highest;
6388      int multibytep;
6389 {
6390   int coding_mask, eol_type;
6391   Lisp_Object val, tmp;
6392   int dummy;
6393
6394   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6395   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6396   if (eol_type == CODING_EOL_INCONSISTENT)
6397     eol_type = CODING_EOL_UNDECIDED;
6398
6399   if (!coding_mask)
6400     {
6401       val = Qundecided;
6402       if (eol_type != CODING_EOL_UNDECIDED)
6403         {
6404           Lisp_Object val2;
6405           val2 = Fget (Qundecided, Qeol_type);
6406           if (VECTORP (val2))
6407             val = XVECTOR (val2)->contents[eol_type];
6408         }
6409       return (highest ? val : Fcons (val, Qnil));
6410     }
6411
6412   /* At first, gather possible coding systems in VAL.  */
6413   val = Qnil;
6414   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6415     {
6416       Lisp_Object category_val, category_index;
6417
6418       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6419       category_val = Fsymbol_value (XCAR (tmp));
6420       if (!NILP (category_val)
6421           && NATNUMP (category_index)
6422           && (coding_mask & (1 << XFASTINT (category_index))))
6423         {
6424           val = Fcons (category_val, val);
6425           if (highest)
6426             break;
6427         }
6428     }
6429   if (!highest)
6430     val = Fnreverse (val);
6431
6432   /* Then, replace the elements with subsidiary coding systems.  */
6433   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6434     {
6435       if (eol_type != CODING_EOL_UNDECIDED
6436           && eol_type != CODING_EOL_INCONSISTENT)
6437         {
6438           Lisp_Object eol;
6439           eol = Fget (XCAR (tmp), Qeol_type);
6440           if (VECTORP (eol))
6441             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6442         }
6443     }
6444   return (highest ? XCAR (val) : val);
6445 }
6446
6447 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6448        2, 3, 0,
6449        doc: /* Detect how the byte sequence in the region is encoded.
6450 Return a list of possible coding systems used on decoding a byte
6451 sequence containing the bytes in the region between START and END when
6452 the coding system `undecided' is specified.  The list is ordered by
6453 priority decided in the current language environment.
6454
6455 If only ASCII characters are found, it returns a list of single element
6456 `undecided' or its subsidiary coding system according to a detected
6457 end-of-line format.
6458
6459 If optional argument HIGHEST is non-nil, return the coding system of
6460 highest priority.  */)
6461      (start, end, highest)
6462      Lisp_Object start, end, highest;
6463 {
6464   int from, to;
6465   int from_byte, to_byte;
6466   int include_anchor_byte = 0;
6467
6468   CHECK_NUMBER_COERCE_MARKER (start);
6469   CHECK_NUMBER_COERCE_MARKER (end);
6470
6471   validate_region (&start, &end);
6472   from = XINT (start), to = XINT (end);
6473   from_byte = CHAR_TO_BYTE (from);
6474   to_byte = CHAR_TO_BYTE (to);
6475
6476   if (from < GPT && to >= GPT)
6477     move_gap_both (to, to_byte);
6478   /* If we an anchor byte `\0' follows the region, we include it in
6479      the detecting source.  Then code detectors can handle the tailing
6480      byte sequence more accurately.
6481
6482      Fix me: This is not a perfect solution.  It is better that we
6483      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6484   */
6485   if (to == Z || (to == GPT && GAP_SIZE > 0))
6486     include_anchor_byte = 1;
6487   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6488                                to_byte - from_byte + include_anchor_byte,
6489                                !NILP (highest),
6490                                !NILP (current_buffer
6491                                       ->enable_multibyte_characters));
6492 }
6493
6494 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6495        1, 2, 0,
6496        doc: /* Detect how the byte sequence in STRING is encoded.
6497 Return a list of possible coding systems used on decoding a byte
6498 sequence containing the bytes in STRING when the coding system
6499 `undecided' is specified.  The list is ordered by priority decided in
6500 the current language environment.
6501
6502 If only ASCII characters are found, it returns a list of single element
6503 `undecided' or its subsidiary coding system according to a detected
6504 end-of-line format.
6505
6506 If optional argument HIGHEST is non-nil, return the coding system of
6507 highest priority.  */)
6508      (string, highest)
6509      Lisp_Object string, highest;
6510 {
6511   CHECK_STRING (string);
6512
6513   return detect_coding_system (SDATA (string),
6514                                /* "+ 1" is to include the anchor byte
6515                                   `\0'.  With this, code detectors can
6516                                   handle the tailing bytes more
6517                                   accurately.  */
6518                                SBYTES (string) + 1,
6519                                !NILP (highest),
6520                                STRING_MULTIBYTE (string));
6521 }
6522
/*  Subroutine for Ffind_coding_systems_region_internal.

    Return a list of coding systems that safely encode the multibyte
    text between P and PEND.  SAFE_CODINGS is the list of candidates
    still under consideration; if it is nil, all candidates have
    already been excluded.  Each element has the form
    (CODING . SAFE-CHARS) or, once it has been expanded here,
    (CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE ACCEPT-LATIN-EXTRA).
    The list is modified destructively: elements are expanded in place
    and candidates that cannot encode some character are unlinked.

    WORK_TABLE is a char-table that records which characters were
    already examined; an element of WORK_TABLE is set to t once the
    character is looked up, so each character is checked only once.

    If a non-ASCII single byte char is found, set
    *single_byte_char_found to 1.  */

static Lisp_Object
find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
     unsigned char *p, *pend;
     Lisp_Object safe_codings, work_table;
     int *single_byte_char_found;
{
  int c, len;
  Lisp_Object val, ch;
  Lisp_Object prev, tail;

  while (p < pend)
    {
      /* Fetch the next character and step over its bytes.  */
      c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
      p += len;
      if (ASCII_BYTE_P (c))
        /* We can ignore ASCII characters here.  */
        continue;
      if (SINGLE_BYTE_CHAR_P (c))
        *single_byte_char_found = 1;
      if (NILP (safe_codings))
        /* Already all coding systems are excluded.  But, we can't
           terminate the loop here because non-ASCII single-byte char
           must be found.  */
        continue;
      /* Check the safe coding systems for C.  */
      ch = make_number (c);
      val = Faref (work_table, ch);
      if (EQ (val, Qt))
        /* This element was already checked.  Ignore it.  */
        continue;
      /* Remember that we checked this element.  */
      Faset (work_table, ch, Qt);

      /* Walk the candidates.  PREV trails TAIL and is the last
         element that was kept, so that a rejected element can be
         unlinked.  */
      for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
        {
          /* The three tables below are set, and later read, only when
             ENCODABLE is zero after the SAFE-CHARS lookup.  */
          Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
          int encodable;

          elt = XCAR (tail);
          if (CONSP (XCDR (elt)))
            {
              /* This entry has this format now:
                 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
                          ACCEPT-LATIN-EXTRA ) */
              val = XCDR (elt);
              encodable = ! NILP (Faref (XCAR (val), ch));
              if (! encodable)
                {
                  /* Fetch the cached tables for the fallback checks.  */
                  val = XCDR (val);
                  translation_table = XCAR (val);
                  hash_table = XCAR (XCDR (val));
                  accept_latin_extra = XCAR (XCDR (XCDR (val)));
                }
            }
          else
            {
              /* This entry has this format now: ( CODING . SAFE-CHARS) */
              encodable = ! NILP (Faref (XCDR (elt), ch));
              if (! encodable)
                {
                  /* Transform the format to:
                     ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
                       ACCEPT-LATIN-EXTRA )  */
                  val = Fget (XCAR (elt), Qcoding_system);
                  /* The encoding translation table is looked up in
                     the plist held in slot 3 of the coding-spec
                     vector; a symbol there stands for the table in
                     its `translation-table' property.  */
                  translation_table
                    = Fplist_get (AREF (val, 3),
                                  Qtranslation_table_for_encode);
                  if (SYMBOLP (translation_table))
                    translation_table = Fget (translation_table,
                                              Qtranslation_table);
                  hash_table
                    = (CHAR_TABLE_P (translation_table)
                       ? XCHAR_TABLE (translation_table)->extras[1]
                       : Qnil);
                  /* Taken from element 16 of the vector in slot 4,
                     and only when slot 0 of the coding-spec is 2.  */
                  accept_latin_extra
                    = ((EQ (AREF (val, 0), make_number (2))
                        && VECTORP (AREF (val, 4)))
                       ? AREF (AREF (val, 4), 16)
                       : Qnil);
                  XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
                                        translation_table, hash_table,
                                        accept_latin_extra));
                }
            }

          /* A character missing from SAFE-CHARS is still accepted if
             the translation table or the hash table has an entry for
             it, or if it is a single-byte char that the coding system
             accepts as a latin extra code.  */
          if (! encodable
              && ((CHAR_TABLE_P (translation_table)
                   && ! NILP (Faref (translation_table, ch)))
                  || (HASH_TABLE_P (hash_table)
                      && ! NILP (Fgethash (ch, hash_table, Qnil)))
                  || (SINGLE_BYTE_CHAR_P (c)
                      && ! NILP (accept_latin_extra)
                      && VECTORP (Vlatin_extra_code_table)
                      && ! NILP (AREF (Vlatin_extra_code_table, c)))))
            encodable = 1;
          if (encodable)
            prev = tail;
          else
            {
              /* Exclude this coding system from SAFE_CODINGS.  */
              if (EQ (tail, safe_codings))
                safe_codings = XCDR (safe_codings);
              else
                XSETCDR (prev, XCDR (tail));
            }
        }
    }
  return safe_codings;
}
6645
6646 DEFUN ("find-coding-systems-region-internal",
6647        Ffind_coding_systems_region_internal,
6648        Sfind_coding_systems_region_internal, 2, 2, 0,
6649        doc: /* Internal use only.  */)
6650      (start, end)
6651      Lisp_Object start, end;
6652 {
6653   Lisp_Object work_table, safe_codings;
6654   int non_ascii_p = 0;
6655   int single_byte_char_found = 0;
6656   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6657
6658   if (STRINGP (start))
6659     {
6660       if (!STRING_MULTIBYTE (start))
6661         return Qt;
6662       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6663       p2 = p2end = p1end;
6664       if (SCHARS (start) != SBYTES (start))
6665         non_ascii_p = 1;
6666     }
6667   else
6668     {
6669       int from, to, stop;
6670
6671       CHECK_NUMBER_COERCE_MARKER (start);
6672       CHECK_NUMBER_COERCE_MARKER (end);
6673       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6674         args_out_of_range (start, end);
6675       if (NILP (current_buffer->enable_multibyte_characters))
6676         return Qt;
6677       from = CHAR_TO_BYTE (XINT (start));
6678       to = CHAR_TO_BYTE (XINT (end));
6679       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6680       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6681       if (stop == to)
6682         p2 = p2end = p1end;
6683       else
6684         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6685       if (XINT (end) - XINT (start) != to - from)
6686         non_ascii_p = 1;
6687     }
6688
6689   if (!non_ascii_p)
6690     {
6691       /* We are sure that the text contains no multibyte character.
6692          Check if it contains eight-bit-graphic.  */
6693       p = p1;
6694       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6695       if (p == p1end)
6696         {
6697           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6698           if (p == p2end)
6699             return Qt;
6700         }
6701     }
6702
6703   /* The text contains non-ASCII characters.  */
6704
6705   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6706   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6707
6708   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6709                                     &single_byte_char_found);
6710   if (p2 < p2end)
6711     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6712                                       &single_byte_char_found);
6713   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6714     safe_codings = Qt;
6715   else
6716     {
6717       /* Turn safe_codings to a list of coding systems... */
6718       Lisp_Object val;
6719
6720       if (single_byte_char_found)
6721         /* ... and append these for eight-bit chars.  */
6722         val = Fcons (Qraw_text,
6723                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6724       else
6725         /* ... and append generic coding systems.  */
6726         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6727
6728       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6729         val = Fcons (XCAR (XCAR (safe_codings)), val);
6730       safe_codings = val;
6731     }
6732
6733   return safe_codings;
6734 }
6735
6736
6737 /* Search from position POS for such characters that are unencodable
6738    accoding to SAFE_CHARS, and return a list of their positions.  P
6739    points where in the memory the character at POS exists.  Limit the
6740    search at PEND or when Nth unencodable characters are found.
6741
6742    If SAFE_CHARS is a char table, an element for an unencodable
6743    character is nil.
6744
6745    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6746
6747    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6748    eight-bit-graphic characters are unencodable.  */
6749
6750 static Lisp_Object
6751 unencodable_char_position (safe_chars, pos, p, pend, n)
6752      Lisp_Object safe_chars;
6753      int pos;
6754      unsigned char *p, *pend;
6755      int n;
6756 {
6757   Lisp_Object pos_list;
6758
6759   pos_list = Qnil;
6760   while (p < pend)
6761     {
6762       int len;
6763       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6764
6765       if (c >= 128
6766           && (CHAR_TABLE_P (safe_chars)
6767               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6768               : (NILP (safe_chars) || c < 256)))
6769         {
6770           pos_list = Fcons (make_number (pos), pos_list);
6771           if (--n <= 0)
6772             break;
6773         }
6774       pos++;
6775       p += len;
6776     }
6777   return Fnreverse (pos_list);
6778 }
6779
6780
6781 DEFUN ("unencodable-char-position", Funencodable_char_position,
6782        Sunencodable_char_position, 3, 5, 0,
6783        doc: /*
6784 Return position of first un-encodable character in a region.
6785 START and END specfiy the region and CODING-SYSTEM specifies the
6786 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6787
6788 If optional 4th argument COUNT is non-nil, it specifies at most how
6789 many un-encodable characters to search.  In this case, the value is a
6790 list of positions.
6791
6792 If optional 5th argument STRING is non-nil, it is a string to search
6793 for un-encodable characters.  In that case, START and END are indexes
6794 to the string.  */)
6795      (start, end, coding_system, count, string)
6796      Lisp_Object start, end, coding_system, count, string;
6797 {
6798   int n;
6799   Lisp_Object safe_chars;
6800   struct coding_system coding;
6801   Lisp_Object positions;
6802   int from, to;
6803   unsigned char *p, *pend;
6804
6805   if (NILP (string))
6806     {
6807       validate_region (&start, &end);
6808       from = XINT (start);
6809       to = XINT (end);
6810       if (NILP (current_buffer->enable_multibyte_characters))
6811         return Qnil;
6812       p = CHAR_POS_ADDR (from);
6813       if (to == GPT)
6814         pend = GPT_ADDR;
6815       else
6816         pend = CHAR_POS_ADDR (to);
6817     }
6818   else
6819     {
6820       CHECK_STRING (string);
6821       CHECK_NATNUM (start);
6822       CHECK_NATNUM (end);
6823       from = XINT (start);
6824       to = XINT (end);
6825       if (from > to
6826           || to > SCHARS (string))
6827         args_out_of_range_3 (string, start, end);
6828       if (! STRING_MULTIBYTE (string))
6829         return Qnil;
6830       p = SDATA (string) + string_char_to_byte (string, from);
6831       pend = SDATA (string) + string_char_to_byte (string, to);
6832     }
6833
6834   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6835
6836   if (NILP (count))
6837     n = 1;
6838   else
6839     {
6840       CHECK_NATNUM (count);
6841       n = XINT (count);
6842     }
6843
6844   if (coding.type == coding_type_no_conversion
6845       || coding.type == coding_type_raw_text)
6846     return Qnil;
6847
6848   if (coding.type == coding_type_undecided)
6849     safe_chars = Qnil;
6850   else
6851     safe_chars = coding_safe_chars (coding_system);
6852
6853   if (STRINGP (string)
6854       || from >= GPT || to <= GPT)
6855     positions = unencodable_char_position (safe_chars, from, p, pend, n);
6856   else
6857     {
6858       Lisp_Object args[2];
6859
6860       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6861       n -= XINT (Flength (args[0]));
6862       if (n <= 0)
6863         positions = args[0];
6864       else
6865         {
6866           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6867                                                pend, n);
6868           positions = Fappend (2, args);
6869         }
6870     }
6871
6872   return  (NILP (count) ? Fcar (positions) : positions);
6873 }
6874
6875
6876 Lisp_Object
6877 code_convert_region1 (start, end, coding_system, encodep)
6878      Lisp_Object start, end, coding_system;
6879      int encodep;
6880 {
6881   struct coding_system coding;
6882   int from, to;
6883
6884   CHECK_NUMBER_COERCE_MARKER (start);
6885   CHECK_NUMBER_COERCE_MARKER (end);
6886   CHECK_SYMBOL (coding_system);
6887
6888   validate_region (&start, &end);
6889   from = XFASTINT (start);
6890   to = XFASTINT (end);
6891
6892   if (NILP (coding_system))
6893     return make_number (to - from);
6894
6895   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6896     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6897
6898   coding.mode |= CODING_MODE_LAST_BLOCK;
6899   coding.src_multibyte = coding.dst_multibyte
6900     = !NILP (current_buffer->enable_multibyte_characters);
6901   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6902                        &coding, encodep, 1);
6903   Vlast_coding_system_used = coding.symbol;
6904   return make_number (coding.produced_char);
6905 }
6906
6907 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6908        3, 3, "r\nzCoding system: ",
6909        doc: /* Decode the current region from the specified coding system.
6910 When called from a program, takes three arguments:
6911 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6912 This function sets `last-coding-system-used' to the precise coding system
6913 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6914 not fully specified.)
6915 It returns the length of the decoded text.  */)
6916      (start, end, coding_system)
6917      Lisp_Object start, end, coding_system;
6918 {
6919   return code_convert_region1 (start, end, coding_system, 0);
6920 }
6921
6922 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6923        3, 3, "r\nzCoding system: ",
6924        doc: /* Encode the current region into the specified coding system.
6925 When called from a program, takes three arguments:
6926 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6927 This function sets `last-coding-system-used' to the precise coding system
6928 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6929 not fully specified.)
6930 It returns the length of the encoded text.  */)
6931      (start, end, coding_system)
6932      Lisp_Object start, end, coding_system;
6933 {
6934   return code_convert_region1 (start, end, coding_system, 1);
6935 }
6936
6937 Lisp_Object
6938 code_convert_string1 (string, coding_system, nocopy, encodep)
6939      Lisp_Object string, coding_system, nocopy;
6940      int encodep;
6941 {
6942   struct coding_system coding;
6943
6944   CHECK_STRING (string);
6945   CHECK_SYMBOL (coding_system);
6946
6947   if (NILP (coding_system))
6948     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6949
6950   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6951     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6952
6953   coding.mode |= CODING_MODE_LAST_BLOCK;
6954   string = (encodep
6955             ? encode_coding_string (string, &coding, !NILP (nocopy))
6956             : decode_coding_string (string, &coding, !NILP (nocopy)));
6957   Vlast_coding_system_used = coding.symbol;
6958
6959   return string;
6960 }
6961
6962 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6963        2, 3, 0,
6964        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6965 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6966 if the decoding operation is trivial.
6967 This function sets `last-coding-system-used' to the precise coding system
6968 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6969 not fully specified.)  */)
6970      (string, coding_system, nocopy)
6971      Lisp_Object string, coding_system, nocopy;
6972 {
6973   return code_convert_string1 (string, coding_system, nocopy, 0);
6974 }
6975
6976 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6977        2, 3, 0,
6978        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6979 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6980 if the encoding operation is trivial.
6981 This function sets `last-coding-system-used' to the precise coding system
6982 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6983 not fully specified.)  */)
6984      (string, coding_system, nocopy)
6985      Lisp_Object string, coding_system, nocopy;
6986 {
6987   return code_convert_string1 (string, coding_system, nocopy, 1);
6988 }
6989
6990 /* Encode or decode STRING according to CODING_SYSTEM.
6991    Do not set Vlast_coding_system_used.
6992
6993    This function is called only from macros DECODE_FILE and
6994    ENCODE_FILE, thus we ignore character composition.  */
6995
6996 Lisp_Object
6997 code_convert_string_norecord (string, coding_system, encodep)
6998      Lisp_Object string, coding_system;
6999      int encodep;
7000 {
7001   struct coding_system coding;
7002
7003   CHECK_STRING (string);
7004   CHECK_SYMBOL (coding_system);
7005
7006   if (NILP (coding_system))
7007     return string;
7008
7009   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7010     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7011
7012   coding.composing = COMPOSITION_DISABLED;
7013   coding.mode |= CODING_MODE_LAST_BLOCK;
7014   return (encodep
7015           ? encode_coding_string (string, &coding, 1)
7016           : decode_coding_string (string, &coding, 1));
7017 }
7018 \f
7019 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7020        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7021 Return the corresponding character.  */)
7022      (code)
7023      Lisp_Object code;
7024 {
7025   unsigned char c1, c2, s1, s2;
7026   Lisp_Object val;
7027
7028   CHECK_NUMBER (code);
7029   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7030   if (s1 == 0)
7031     {
7032       if (s2 < 0x80)
7033         XSETFASTINT (val, s2);
7034       else if (s2 >= 0xA0 || s2 <= 0xDF)
7035         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7036       else
7037         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7038     }
7039   else
7040     {
7041       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7042           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7043         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7044       DECODE_SJIS (s1, s2, c1, c2);
7045       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7046     }
7047   return val;
7048 }
7049
7050 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7051        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7052 Return the corresponding code in SJIS.  */)
7053      (ch)
7054      Lisp_Object ch;
7055 {
7056   int charset, c1, c2, s1, s2;
7057   Lisp_Object val;
7058
7059   CHECK_NUMBER (ch);
7060   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7061   if (charset == CHARSET_ASCII)
7062     {
7063       val = ch;
7064     }
7065   else if (charset == charset_jisx0208
7066            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7067     {
7068       ENCODE_SJIS (c1, c2, s1, s2);
7069       XSETFASTINT (val, (s1 << 8) | s2);
7070     }
7071   else if (charset == charset_katakana_jisx0201
7072            && c1 > 0x20 && c2 < 0xE0)
7073     {
7074       XSETFASTINT (val, c1 | 0x80);
7075     }
7076   else
7077     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7078   return val;
7079 }
7080
7081 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7082        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7083 Return the corresponding character.  */)
7084      (code)
7085      Lisp_Object code;
7086 {
7087   int charset;
7088   unsigned char b1, b2, c1, c2;
7089   Lisp_Object val;
7090
7091   CHECK_NUMBER (code);
7092   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7093   if (b1 == 0)
7094     {
7095       if (b2 >= 0x80)
7096         error ("Invalid BIG5 code: %x", XFASTINT (code));
7097       val = code;
7098     }
7099   else
7100     {
7101       if ((b1 < 0xA1 || b1 > 0xFE)
7102           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7103         error ("Invalid BIG5 code: %x", XFASTINT (code));
7104       DECODE_BIG5 (b1, b2, charset, c1, c2);
7105       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7106     }
7107   return val;
7108 }
7109
7110 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7111        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7112 Return the corresponding character code in Big5.  */)
7113      (ch)
7114      Lisp_Object ch;
7115 {
7116   int charset, c1, c2, b1, b2;
7117   Lisp_Object val;
7118
7119   CHECK_NUMBER (ch);
7120   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7121   if (charset == CHARSET_ASCII)
7122     {
7123       val = ch;
7124     }
7125   else if ((charset == charset_big5_1
7126             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7127            || (charset == charset_big5_2
7128                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7129     {
7130       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7131       XSETFASTINT (val, (b1 << 8) | b2);
7132     }
7133   else
7134     error ("Can't encode to Big5: %d", XFASTINT (ch));
7135   return val;
7136 }
7137 \f
7138 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7139        Sset_terminal_coding_system_internal, 1, 1, 0,
7140        doc: /* Internal use only.  */)
7141      (coding_system)
7142      Lisp_Object coding_system;
7143 {
7144   CHECK_SYMBOL (coding_system);
7145   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7146   /* We had better not send unsafe characters to terminal.  */
7147   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7148   /* Character composition should be disabled.  */
7149   terminal_coding.composing = COMPOSITION_DISABLED;
7150   /* Error notification should be suppressed.  */
7151   terminal_coding.suppress_error = 1;
7152   terminal_coding.src_multibyte = 1;
7153   terminal_coding.dst_multibyte = 0;
7154   return Qnil;
7155 }
7156
7157 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7158        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7159        doc: /* Internal use only.  */)
7160      (coding_system)
7161      Lisp_Object coding_system;
7162 {
7163   CHECK_SYMBOL (coding_system);
7164   setup_coding_system (Fcheck_coding_system (coding_system),
7165                        &safe_terminal_coding);
7166   /* Character composition should be disabled.  */
7167   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7168   /* Error notification should be suppressed.  */
7169   terminal_coding.suppress_error = 1;
7170   safe_terminal_coding.src_multibyte = 1;
7171   safe_terminal_coding.dst_multibyte = 0;
7172   return Qnil;
7173 }
7174
7175 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7176        Sterminal_coding_system, 0, 0, 0,
7177        doc: /* Return coding system specified for terminal output.  */)
7178      ()
7179 {
7180   return terminal_coding.symbol;
7181 }
7182
7183 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7184        Sset_keyboard_coding_system_internal, 1, 1, 0,
7185        doc: /* Internal use only.  */)
7186      (coding_system)
7187      Lisp_Object coding_system;
7188 {
7189   CHECK_SYMBOL (coding_system);
7190   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7191   /* Character composition should be disabled.  */
7192   keyboard_coding.composing = COMPOSITION_DISABLED;
7193   return Qnil;
7194 }
7195
7196 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7197        Skeyboard_coding_system, 0, 0, 0,
7198        doc: /* Return coding system specified for decoding keyboard input.  */)
7199      ()
7200 {
7201   return keyboard_coding.symbol;
7202 }
7203
7204 \f
7205 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7206        Sfind_operation_coding_system,  1, MANY, 0,
7207        doc: /* Choose a coding system for an operation based on the target name.
7208 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7209 DECODING-SYSTEM is the coding system to use for decoding
7210 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7211 for encoding (in case OPERATION does encoding).
7212
7213 The first argument OPERATION specifies an I/O primitive:
7214   For file I/O, `insert-file-contents' or `write-region'.
7215   For process I/O, `call-process', `call-process-region', or `start-process'.
7216   For network I/O, `open-network-stream'.
7217
7218 The remaining arguments should be the same arguments that were passed
7219 to the primitive.  Depending on which primitive, one of those arguments
7220 is selected as the TARGET.  For example, if OPERATION does file I/O,
7221 whichever argument specifies the file name is TARGET.
7222
7223 TARGET has a meaning which depends on OPERATION:
7224   For file I/O, TARGET is a file name.
7225   For process I/O, TARGET is a process name.
7226   For network I/O, TARGET is a service name or a port number
7227
7228 This function looks up what specified for TARGET in,
7229 `file-coding-system-alist', `process-coding-system-alist',
7230 or `network-coding-system-alist' depending on OPERATION.
7231 They may specify a coding system, a cons of coding systems,
7232 or a function symbol to call.
7233 In the last case, we call the function with one argument,
7234 which is a list of all the arguments given to this function.
7235
7236 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7237      (nargs, args)
7238      int nargs;
7239      Lisp_Object *args;
7240 {
7241   Lisp_Object operation, target_idx, target, val;
7242   register Lisp_Object chain;
7243
7244   if (nargs < 2)
7245     error ("Too few arguments");
7246   operation = args[0];
7247   if (!SYMBOLP (operation)
7248       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7249     error ("Invalid first argument");
7250   if (nargs < 1 + XINT (target_idx))
7251     error ("Too few arguments for operation: %s",
7252            SDATA (SYMBOL_NAME (operation)));
7253   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7254      argument to write-region) is string, it must be treated as a
7255      target file name.  */
7256   if (EQ (operation, Qwrite_region)
7257       && nargs > 5
7258       && STRINGP (args[5]))
7259     target_idx = make_number (4);
7260   target = args[XINT (target_idx) + 1];
7261   if (!(STRINGP (target)
7262         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7263     error ("Invalid argument %d", XINT (target_idx) + 1);
7264
7265   chain = ((EQ (operation, Qinsert_file_contents)
7266             || EQ (operation, Qwrite_region))
7267            ? Vfile_coding_system_alist
7268            : (EQ (operation, Qopen_network_stream)
7269               ? Vnetwork_coding_system_alist
7270               : Vprocess_coding_system_alist));
7271   if (NILP (chain))
7272     return Qnil;
7273
7274   for (; CONSP (chain); chain = XCDR (chain))
7275     {
7276       Lisp_Object elt;
7277       elt = XCAR (chain);
7278
7279       if (CONSP (elt)
7280           && ((STRINGP (target)
7281                && STRINGP (XCAR (elt))
7282                && fast_string_match (XCAR (elt), target) >= 0)
7283               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7284         {
7285           val = XCDR (elt);
7286           /* Here, if VAL is both a valid coding system and a valid
7287              function symbol, we return VAL as a coding system.  */
7288           if (CONSP (val))
7289             return val;
7290           if (! SYMBOLP (val))
7291             return Qnil;
7292           if (! NILP (Fcoding_system_p (val)))
7293             return Fcons (val, val);
7294           if (! NILP (Ffboundp (val)))
7295             {
7296               val = call1 (val, Flist (nargs, args));
7297               if (CONSP (val))
7298                 return val;
7299               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7300                 return Fcons (val, val);
7301             }
7302           return Qnil;
7303         }
7304     }
7305   return Qnil;
7306 }
7307
7308 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7309        Supdate_coding_systems_internal, 0, 0, 0,
7310        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7311 When values of any coding categories are changed, you must
7312 call this function.  */)
7313      ()
7314 {
7315   int i;
7316
7317   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7318     {
7319       Lisp_Object val;
7320
7321       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7322       if (!NILP (val))
7323         {
7324           if (! coding_system_table[i])
7325             coding_system_table[i] = ((struct coding_system *)
7326                                       xmalloc (sizeof (struct coding_system)));
7327           setup_coding_system (val, coding_system_table[i]);
7328         }
7329       else if (coding_system_table[i])
7330         {
7331           xfree (coding_system_table[i]);
7332           coding_system_table[i] = NULL;
7333         }
7334     }
7335
7336   return Qnil;
7337 }
7338
7339 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7340        Sset_coding_priority_internal, 0, 0, 0,
7341        doc: /* Update internal database for the current value of `coding-category-list'.
7342 This function is internal use only.  */)
7343      ()
7344 {
7345   int i = 0, idx;
7346   Lisp_Object val;
7347
7348   val = Vcoding_category_list;
7349
7350   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7351     {
7352       if (! SYMBOLP (XCAR (val)))
7353         break;
7354       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7355       if (idx >= CODING_CATEGORY_IDX_MAX)
7356         break;
7357       coding_priorities[i++] = (1 << idx);
7358       val = XCDR (val);
7359     }
7360   /* If coding-category-list is valid and contains all coding
7361      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7362      the following code saves Emacs from crashing.  */
7363   while (i < CODING_CATEGORY_IDX_MAX)
7364     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7365
7366   return Qnil;
7367 }
7368
7369 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7370        Sdefine_coding_system_internal, 1, 1, 0,
7371        doc: /* Register CODING-SYSTEM as a base coding system.
7372 This function is internal use only.  */)
7373      (coding_system)
7374      Lisp_Object coding_system;
7375 {
7376   Lisp_Object safe_chars, slot;
7377
7378   if (NILP (Fcheck_coding_system (coding_system)))
7379     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7380   safe_chars = coding_safe_chars (coding_system);
7381   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7382     error ("No valid safe-chars property for %s",
7383            SDATA (SYMBOL_NAME (coding_system)));
7384   if (EQ (safe_chars, Qt))
7385     {
7386       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7387         XSETCAR (Vcoding_system_safe_chars,
7388                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7389     }
7390   else
7391     {
7392       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7393       if (NILP (slot))
7394         XSETCDR (Vcoding_system_safe_chars,
7395                  nconc2 (XCDR (Vcoding_system_safe_chars),
7396                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7397       else
7398         XSETCDR (slot, safe_chars);
7399     }
7400   return Qnil;
7401 }
7402
7403 #endif /* emacs */
7404
7405 \f
7406 /*** 9. Post-amble ***/
7407
7408 void
7409 init_coding_once ()
7410 {
7411   int i;
7412
7413   /* Emacs' internal format specific initialize routine.  */
7414   for (i = 0; i <= 0x20; i++)
7415     emacs_code_class[i] = EMACS_control_code;
7416   emacs_code_class[0x0A] = EMACS_linefeed_code;
7417   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7418   for (i = 0x21 ; i < 0x7F; i++)
7419     emacs_code_class[i] = EMACS_ascii_code;
7420   emacs_code_class[0x7F] = EMACS_control_code;
7421   for (i = 0x80; i < 0xFF; i++)
7422     emacs_code_class[i] = EMACS_invalid_code;
7423   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7424   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7425   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7426   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7427
7428   /* ISO2022 specific initialize routine.  */
7429   for (i = 0; i < 0x20; i++)
7430     iso_code_class[i] = ISO_control_0;
7431   for (i = 0x21; i < 0x7F; i++)
7432     iso_code_class[i] = ISO_graphic_plane_0;
7433   for (i = 0x80; i < 0xA0; i++)
7434     iso_code_class[i] = ISO_control_1;
7435   for (i = 0xA1; i < 0xFF; i++)
7436     iso_code_class[i] = ISO_graphic_plane_1;
7437   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7438   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7439   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7440   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7441   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7442   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7443   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7444   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7445   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7446   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7447
7448   setup_coding_system (Qnil, &keyboard_coding);
7449   setup_coding_system (Qnil, &terminal_coding);
7450   setup_coding_system (Qnil, &safe_terminal_coding);
7451   setup_coding_system (Qnil, &default_buffer_file_coding);
7452
7453   bzero (coding_system_table, sizeof coding_system_table);
7454
7455   bzero (ascii_skip_code, sizeof ascii_skip_code);
7456   for (i = 0; i < 128; i++)
7457     ascii_skip_code[i] = 1;
7458
7459 #if defined (MSDOS) || defined (WINDOWSNT)
7460   system_eol_type = CODING_EOL_CRLF;
7461 #else
7462   system_eol_type = CODING_EOL_LF;
7463 #endif
7464
7465   inhibit_pre_post_conversion = 0;
7466 }
7467
7468 #ifdef emacs
7469
7470 void
7471 syms_of_coding ()
7472 {
7473   Qtarget_idx = intern ("target-idx");
7474   staticpro (&Qtarget_idx);
7475
7476   Qcoding_system_history = intern ("coding-system-history");
7477   staticpro (&Qcoding_system_history);
7478   Fset (Qcoding_system_history, Qnil);
7479
7480   /* Target FILENAME is the first argument.  */
7481   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7482   /* Target FILENAME is the third argument.  */
7483   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7484
7485   Qcall_process = intern ("call-process");
7486   staticpro (&Qcall_process);
7487   /* Target PROGRAM is the first argument.  */
7488   Fput (Qcall_process, Qtarget_idx, make_number (0));
7489
7490   Qcall_process_region = intern ("call-process-region");
7491   staticpro (&Qcall_process_region);
7492   /* Target PROGRAM is the third argument.  */
7493   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7494
7495   Qstart_process = intern ("start-process");
7496   staticpro (&Qstart_process);
7497   /* Target PROGRAM is the third argument.  */
7498   Fput (Qstart_process, Qtarget_idx, make_number (2));
7499
7500   Qopen_network_stream = intern ("open-network-stream");
7501   staticpro (&Qopen_network_stream);
7502   /* Target SERVICE is the fourth argument.  */
7503   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7504
7505   Qcoding_system = intern ("coding-system");
7506   staticpro (&Qcoding_system);
7507
7508   Qeol_type = intern ("eol-type");
7509   staticpro (&Qeol_type);
7510
7511   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7512   staticpro (&Qbuffer_file_coding_system);
7513
7514   Qpost_read_conversion = intern ("post-read-conversion");
7515   staticpro (&Qpost_read_conversion);
7516
7517   Qpre_write_conversion = intern ("pre-write-conversion");
7518   staticpro (&Qpre_write_conversion);
7519
7520   Qno_conversion = intern ("no-conversion");
7521   staticpro (&Qno_conversion);
7522
7523   Qundecided = intern ("undecided");
7524   staticpro (&Qundecided);
7525
7526   Qcoding_system_p = intern ("coding-system-p");
7527   staticpro (&Qcoding_system_p);
7528
7529   Qcoding_system_error = intern ("coding-system-error");
7530   staticpro (&Qcoding_system_error);
7531
7532   Fput (Qcoding_system_error, Qerror_conditions,
7533         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7534   Fput (Qcoding_system_error, Qerror_message,
7535         build_string ("Invalid coding system"));
7536
7537   Qcoding_category = intern ("coding-category");
7538   staticpro (&Qcoding_category);
7539   Qcoding_category_index = intern ("coding-category-index");
7540   staticpro (&Qcoding_category_index);
7541
7542   Vcoding_category_table
7543     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7544   staticpro (&Vcoding_category_table);
7545   {
7546     int i;
7547     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7548       {
7549         XVECTOR (Vcoding_category_table)->contents[i]
7550           = intern (coding_category_name[i]);
7551         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7552               Qcoding_category_index, make_number (i));
7553       }
7554   }
7555
7556   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7557   staticpro (&Vcoding_system_safe_chars);
7558
7559   Qtranslation_table = intern ("translation-table");
7560   staticpro (&Qtranslation_table);
7561   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7562
7563   Qtranslation_table_id = intern ("translation-table-id");
7564   staticpro (&Qtranslation_table_id);
7565
7566   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7567   staticpro (&Qtranslation_table_for_decode);
7568
7569   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7570   staticpro (&Qtranslation_table_for_encode);
7571
7572   Qsafe_chars = intern ("safe-chars");
7573   staticpro (&Qsafe_chars);
7574
7575   Qchar_coding_system = intern ("char-coding-system");
7576   staticpro (&Qchar_coding_system);
7577
7578   /* Intern this now in case it isn't already done.
7579      Setting this variable twice is harmless.
7580      But don't staticpro it here--that is done in alloc.c.  */
7581   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7582   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7583   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7584
7585   Qvalid_codes = intern ("valid-codes");
7586   staticpro (&Qvalid_codes);
7587
7588   Qemacs_mule = intern ("emacs-mule");
7589   staticpro (&Qemacs_mule);
7590
7591   Qraw_text = intern ("raw-text");
7592   staticpro (&Qraw_text);
7593
7594   Qutf_8 = intern ("utf-8");
7595   staticpro (&Qutf_8);
7596
7597   defsubr (&Scoding_system_p);
7598   defsubr (&Sread_coding_system);
7599   defsubr (&Sread_non_nil_coding_system);
7600   defsubr (&Scheck_coding_system);
7601   defsubr (&Sdetect_coding_region);
7602   defsubr (&Sdetect_coding_string);
7603   defsubr (&Sfind_coding_systems_region_internal);
7604   defsubr (&Sunencodable_char_position);
7605   defsubr (&Sdecode_coding_region);
7606   defsubr (&Sencode_coding_region);
7607   defsubr (&Sdecode_coding_string);
7608   defsubr (&Sencode_coding_string);
7609   defsubr (&Sdecode_sjis_char);
7610   defsubr (&Sencode_sjis_char);
7611   defsubr (&Sdecode_big5_char);
7612   defsubr (&Sencode_big5_char);
7613   defsubr (&Sset_terminal_coding_system_internal);
7614   defsubr (&Sset_safe_terminal_coding_system_internal);
7615   defsubr (&Sterminal_coding_system);
7616   defsubr (&Sset_keyboard_coding_system_internal);
7617   defsubr (&Skeyboard_coding_system);
7618   defsubr (&Sfind_operation_coding_system);
7619   defsubr (&Supdate_coding_systems_internal);
7620   defsubr (&Sset_coding_priority_internal);
7621   defsubr (&Sdefine_coding_system_internal);
7622
7623   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7624                doc: /* List of coding systems.
7625
7626 Do not alter the value of this variable manually.  This variable should be
7627 updated by the functions `make-coding-system' and
7628 `define-coding-system-alias'.  */);
7629   Vcoding_system_list = Qnil;
7630
7631   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7632                doc: /* Alist of coding system names.
7633 Each element is one element list of coding system name.
7634 This variable is given to `completing-read' as TABLE argument.
7635
7636 Do not alter the value of this variable manually.  This variable should be
7637 updated by the functions `make-coding-system' and
7638 `define-coding-system-alias'.  */);
7639   Vcoding_system_alist = Qnil;
7640
7641   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7642                doc: /* List of coding-categories (symbols) ordered by priority.
7643
7644 On detecting a coding system, Emacs tries code detection algorithms
7645 associated with each coding-category one by one in this order.  When
7646 one algorithm agrees with a byte sequence of source text, the coding
7647 system bound to the corresponding coding-category is selected.  */);
7648   {
7649     int i;
7650
7651     Vcoding_category_list = Qnil;
7652     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7653       Vcoding_category_list
7654         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7655                  Vcoding_category_list);
7656   }
7657
7658   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7659                doc: /* Specify the coding system for read operations.
7660 It is useful to bind this variable with `let', but do not set it globally.
7661 If the value is a coding system, it is used for decoding on read operation.
7662 If not, an appropriate element is used from one of the coding system alists:
7663 There are three such tables, `file-coding-system-alist',
7664 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7665   Vcoding_system_for_read = Qnil;
7666
7667   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7668                doc: /* Specify the coding system for write operations.
7669 Programs bind this variable with `let', but you should not set it globally.
7670 If the value is a coding system, it is used for encoding of output,
7671 when writing it to a file and when sending it to a file or subprocess.
7672
7673 If this does not specify a coding system, an appropriate element
7674 is used from one of the coding system alists:
7675 There are three such tables, `file-coding-system-alist',
7676 `process-coding-system-alist', and `network-coding-system-alist'.
7677 For output to files, if the above procedure does not specify a coding system,
7678 the value of `buffer-file-coding-system' is used.  */);
7679   Vcoding_system_for_write = Qnil;
7680
7681   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7682                doc: /* Coding system used in the latest file or process I/O.
7683 Also set by `encode-coding-region', `decode-coding-region',
7684 `encode-coding-string' and `decode-coding-string'.  */);
7685   Vlast_coding_system_used = Qnil;
7686
7687   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7688                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7689 See info node `Coding Systems' and info node `Text and Binary' concerning
7690 such conversion.  */);
7691   inhibit_eol_conversion = 0;
7692
7693   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7694                doc: /* Non-nil means process buffer inherits coding system of process output.
7695 Bind it to t if the process output is to be treated as if it were a file
7696 read from some filesystem.  */);
7697   inherit_process_coding_system = 0;
7698
7699   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7700                doc: /* Alist to decide a coding system to use for a file I/O operation.
7701 The format is ((PATTERN . VAL) ...),
7702 where PATTERN is a regular expression matching a file name,
7703 VAL is a coding system, a cons of coding systems, or a function symbol.
7704 If VAL is a coding system, it is used for both decoding and encoding
7705 the file contents.
7706 If VAL is a cons of coding systems, the car part is used for decoding,
7707 and the cdr part is used for encoding.
7708 If VAL is a function symbol, the function must return a coding system
7709 or a cons of coding systems which are used as above.  The function gets
7710 the arguments with which `find-operation-coding-system' was called.
7711
7712 See also the function `find-operation-coding-system'
7713 and the variable `auto-coding-alist'.  */);
7714   Vfile_coding_system_alist = Qnil;
7715
7716   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7717     doc: /* Alist to decide a coding system to use for a process I/O operation.
7718 The format is ((PATTERN . VAL) ...),
7719 where PATTERN is a regular expression matching a program name,
7720 VAL is a coding system, a cons of coding systems, or a function symbol.
7721 If VAL is a coding system, it is used for both decoding what received
7722 from the program and encoding what sent to the program.
7723 If VAL is a cons of coding systems, the car part is used for decoding,
7724 and the cdr part is used for encoding.
7725 If VAL is a function symbol, the function must return a coding system
7726 or a cons of coding systems which are used as above.
7727
7728 See also the function `find-operation-coding-system'.  */);
7729   Vprocess_coding_system_alist = Qnil;
7730
7731   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7732     doc: /* Alist to decide a coding system to use for a network I/O operation.
7733 The format is ((PATTERN . VAL) ...),
7734 where PATTERN is a regular expression matching a network service name
7735 or is a port number to connect to,
7736 VAL is a coding system, a cons of coding systems, or a function symbol.
7737 If VAL is a coding system, it is used for both decoding what received
7738 from the network stream and encoding what sent to the network stream.
7739 If VAL is a cons of coding systems, the car part is used for decoding,
7740 and the cdr part is used for encoding.
7741 If VAL is a function symbol, the function must return a coding system
7742 or a cons of coding systems which are used as above.
7743
7744 See also the function `find-operation-coding-system'.  */);
7745   Vnetwork_coding_system_alist = Qnil;
7746
7747   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7748                doc: /* Coding system to use with system messages.
7749 Also used for decoding keyboard input on X Window system.  */);
7750   Vlocale_coding_system = Qnil;
7751
7752   /* The eol mnemonics are reset in startup.el system-dependently.  */
7753   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7754                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7755   eol_mnemonic_unix = build_string (":");
7756
7757   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7758                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7759   eol_mnemonic_dos = build_string ("\\");
7760
7761   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7762                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7763   eol_mnemonic_mac = build_string ("/");
7764
7765   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7766                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7767   eol_mnemonic_undecided = build_string (":");
7768
7769   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7770                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7771   Venable_character_translation = Qt;
7772
7773   DEFVAR_LISP ("standard-translation-table-for-decode",
7774                &Vstandard_translation_table_for_decode,
7775                doc: /* Table for translating characters while decoding.  */);
7776   Vstandard_translation_table_for_decode = Qnil;
7777
7778   DEFVAR_LISP ("standard-translation-table-for-encode",
7779                &Vstandard_translation_table_for_encode,
7780                doc: /* Table for translating characters while encoding.  */);
7781   Vstandard_translation_table_for_encode = Qnil;
7782
7783   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7784                doc: /* Alist of charsets vs revision numbers.
7785 While encoding, if a charset (car part of an element) is found,
7786 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7787   Vcharset_revision_alist = Qnil;
7788
7789   DEFVAR_LISP ("default-process-coding-system",
7790                &Vdefault_process_coding_system,
7791                doc: /* Cons of coding systems used for process I/O by default.
7792 The car part is used for decoding a process output,
7793 the cdr part is used for encoding a text to be sent to a process.  */);
7794   Vdefault_process_coding_system = Qnil;
7795
7796   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7797                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7798 This is a vector of length 256.
7799 If Nth element is non-nil, the existence of code N in a file
7800 \(or output of subprocess) doesn't prevent it to be detected as
7801 a coding system of ISO 2022 variant which has a flag
7802 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7803 or reading output of a subprocess.
7804 Only 128th through 159th elements has a meaning.  */);
7805   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7806
7807   DEFVAR_LISP ("select-safe-coding-system-function",
7808                &Vselect_safe_coding_system_function,
7809                doc: /* Function to call to select safe coding system for encoding a text.
7810
7811 If set, this function is called to force a user to select a proper
7812 coding system which can encode the text in the case that a default
7813 coding system used in each operation can't encode the text.
7814
7815 The default value is `select-safe-coding-system' (which see).  */);
7816   Vselect_safe_coding_system_function = Qnil;
7817
7818   DEFVAR_BOOL ("coding-system-require-warning",
7819                &coding_system_require_warning,
7820                doc: /* Internal use only.
7821 If non-nil, on writing a file, `select-safe-coding-system-function' is
7822 called even if `coding-system-for-write' is non-nil.  The command
7823 `universal-coding-system-argument' binds this variable to t temporarily.  */);
7824   coding_system_require_warning = 0;
7825
7826
7827   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7828                &inhibit_iso_escape_detection,
7829                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7830
7831 By default, on reading a file, Emacs tries to detect how the text is
7832 encoded.  This code detection is sensitive to escape sequences.  If
7833 the sequence is valid as ISO2022, the code is determined as one of
7834 the ISO2022 encodings, and the file is decoded by the corresponding
7835 coding system (e.g. `iso-2022-7bit').
7836
7837 However, there may be a case that you want to read escape sequences in
7838 a file as is.  In such a case, you can set this variable to non-nil.
7839 Then, as the code detection ignores any escape sequences, no file is
7840 detected as encoded in some ISO2022 encoding.  The result is that all
7841 escape sequences become visible in a buffer.
7842
7843 The default value is nil, and it is strongly recommended not to change
7844 it.  That is because many Emacs Lisp source files that contain
7845 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7846 in Emacs's distribution, and they won't be decoded correctly on
7847 reading if you suppress escape sequence detection.
7848
7849 The other way to read escape sequences in a file without decoding is
7850 to explicitly specify some coding system that doesn't use ISO2022's
7851 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
7852   inhibit_iso_escape_detection = 0;
7853
7854   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7855                doc: /* Char table for translating self-inserting characters.
7856 This is applied to the result of input methods, not their input.  See also
7857 `keyboard-translate-table'.  */);
7858     Vtranslation_table_for_input = Qnil;
7859 }
7860
7861 char *
7862 emacs_strerror (error_number)
7863      int error_number;
7864 {
7865   char *str;
7866
7867   synchronize_system_messages_locale ();
7868   str = strerror (error_number);
7869
7870   if (! NILP (Vlocale_coding_system))
7871     {
7872       Lisp_Object dec = code_convert_string_norecord (build_string (str),
7873                                                       Vlocale_coding_system,
7874                                                       0);
7875       str = (char *) SDATA (dec);
7876     }
7877
7878   return str;
7879 }
7880
7881 #endif /* emacs */
7882