src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995,97,1998,2002,2003  Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002,2003  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
 170   DST_BYTES zero means that the source area and destination area are
 171   overlapped, which means that we can produce encoded text until it
  172   reaches the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
  188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  189    get one and two bytes from the source text respectively.
 190    If there are not enough bytes in the source, they jump to
 191    `label_end_of_loop'.  The caller should set variables `coding',
 192    `src' and `src_end' to appropriate pointer in advance.  These
 193    macros are called from decoding routines `decode_coding_XXX', thus
 194    it is assumed that the source text is unibyte.  */
 195
 196 #define ONE_MORE_BYTE(c1)                                       \
 197   do {                                                          \
 198     if (src >= src_end)                                         \
 199       {                                                         \
 200         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 201         goto label_end_of_loop;                                 \
 202       }                                                         \
 203     c1 = *src++;                                                \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src + 1 >= src_end)                                     \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = *src++;                                                \
 214     c2 = *src++;                                                \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 219    form if MULTIBYTEP is nonzero.  */
 220
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 230       c1 = *src++ - 0x20;                                       \
 231   } while (0)
 232
 233 /* Set C to the next character at the source text pointed by `src'.
 234    If there are not enough characters in the source, jump to
 235    `label_end_of_loop'.  The caller should set variables `coding'
 236    `src', `src_end', and `translation_table' to appropriate pointers
 237    in advance.  This macro is used in encoding routines
 238    `encode_coding_XXX', thus it assumes that the source text is in
 239    multibyte form except for 8-bit characters.  8-bit characters are
 240    in multibyte form if coding->src_multibyte is nonzero, else they
 241    are represented by a single byte.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;                                      \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  Jump to
 264    `label_end_of_loop' if there's not enough space at `dst'.
 265
 266    If we are now in the middle of a composition sequence, the decoded
 267    character may be ALTCHAR (for the current composition).  In that
 268    case, the character goes to coding->cmp_data->data instead of
 269    `dst'.
 270
 271    This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {                                                          \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {                                                          \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)                                           \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348
 349 #else  /* not emacs */
 350
 351 #include "mulelib.h"
 352
 353 #endif /* not emacs */
 354
 355 Lisp_Object Qcoding_system, Qeol_type;
 356 Lisp_Object Qbuffer_file_coding_system;
 357 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 358 Lisp_Object Qno_conversion, Qundecided;
 359 Lisp_Object Qcoding_system_history;
 360 Lisp_Object Qsafe_chars;
 361 Lisp_Object Qvalid_codes;
 362
 363 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 364 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 365 Lisp_Object Qstart_process, Qopen_network_stream;
 366 Lisp_Object Qtarget_idx;
 367
 368 Lisp_Object Vselect_safe_coding_system_function;
 369
 370 int coding_system_require_warning;
 371
 372 /* Mnemonic string for each format of end-of-line.  */
 373 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 374 /* Mnemonic string to indicate format of end-of-line is not yet
 375    decided.  */
 376 Lisp_Object eol_mnemonic_undecided;
 377
 378 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 379    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 380 int system_eol_type;
 381
 382 #ifdef emacs
 383
 384 /* Information about which coding system is safe for which chars.
 385    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 386
 387    GENERIC-LIST is a list of generic coding systems which can encode
 388    any characters.
 389
 390    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 391    corresponding char table that contains safe chars.  */
 392 Lisp_Object Vcoding_system_safe_chars;
 393
 394 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 395
 396 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 397
 398 /* Coding system emacs-mule and raw-text are for converting only
 399    end-of-line format.  */
 400 Lisp_Object Qemacs_mule, Qraw_text;
 401
 402 /* Coding-systems are handed between Emacs Lisp programs and C internal
 403    routines by the following three variables.  */
 404 /* Coding-system for reading files and receiving data from process.  */
 405 Lisp_Object Vcoding_system_for_read;
 406 /* Coding-system for writing files and sending data to process.  */
 407 Lisp_Object Vcoding_system_for_write;
 408 /* Coding-system actually used in the latest I/O.  */
 409 Lisp_Object Vlast_coding_system_used;
 410
 411 /* A vector of length 256 which contains information about special
 412    Latin codes (especially for dealing with Microsoft codes).  */
 413 Lisp_Object Vlatin_extra_code_table;
 414
 415 /* Flag to inhibit code conversion of end-of-line format.  */
 416 int inhibit_eol_conversion;
 417
 418 /* Flag to inhibit ISO2022 escape sequence detection.  */
 419 int inhibit_iso_escape_detection;
 420
 421 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 422 int inherit_process_coding_system;
 423
 424 /* Coding system to be used to encode text for terminal display.  */
 425 struct coding_system terminal_coding;
 426
 427 /* Coding system to be used to encode text for terminal display when
 428    terminal coding system is nil.  */
 429 struct coding_system safe_terminal_coding;
 430
 431 /* Coding system of what is sent from terminal keyboard.  */
 432 struct coding_system keyboard_coding;
 433
 434 /* Default coding system to be used to write a file.  */
 435 struct coding_system default_buffer_file_coding;
 436
 437 Lisp_Object Vfile_coding_system_alist;
 438 Lisp_Object Vprocess_coding_system_alist;
 439 Lisp_Object Vnetwork_coding_system_alist;
 440
 441 Lisp_Object Vlocale_coding_system;
 442
 443 #endif /* emacs */
 444
 445 Lisp_Object Qcoding_category, Qcoding_category_index;
 446
 447 /* List of symbols `coding-category-xxx' ordered by priority.  */
 448 Lisp_Object Vcoding_category_list;
 449
 450 /* Table of coding categories (Lisp symbols).  */
 451 Lisp_Object Vcoding_category_table;
 452
 453 /* Table of names of symbol for each coding-category.  */
 454 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 455   "coding-category-emacs-mule",
 456   "coding-category-sjis",
 457   "coding-category-iso-7",
 458   "coding-category-iso-7-tight",
 459   "coding-category-iso-8-1",
 460   "coding-category-iso-8-2",
 461   "coding-category-iso-7-else",
 462   "coding-category-iso-8-else",
 463   "coding-category-ccl",
 464   "coding-category-big5",
 465   "coding-category-utf-8",
 466   "coding-category-utf-16-be",
 467   "coding-category-utf-16-le",
 468   "coding-category-raw-text",
 469   "coding-category-binary"
 470 };
 471
 472 /* Table of pointers to coding systems corresponding to each coding
 473    categories.  */
 474 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 475
 476 /* Table of coding category masks.  Nth element is a mask for a coding
 477    category of which priority is Nth.  */
 478 static
 479 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 480
 481 /* Flag to tell if we look up translation table on character code
 482    conversion.  */
 483 Lisp_Object Venable_character_translation;
 484 /* Standard translation table to look up on decoding (reading).  */
 485 Lisp_Object Vstandard_translation_table_for_decode;
 486 /* Standard translation table to look up on encoding (writing).  */
 487 Lisp_Object Vstandard_translation_table_for_encode;
 488
 489 Lisp_Object Qtranslation_table;
 490 Lisp_Object Qtranslation_table_id;
 491 Lisp_Object Qtranslation_table_for_decode;
 492 Lisp_Object Qtranslation_table_for_encode;
 493
 494 /* Alist of charsets vs revision number.  */
 495 Lisp_Object Vcharset_revision_alist;
 496
 497 /* Default coding systems used for process I/O.  */
 498 Lisp_Object Vdefault_process_coding_system;
 499
 500 /* Char table for translating Quail and self-inserting input.  */
 501 Lisp_Object Vtranslation_table_for_input;
 502
 503 /* Global flag to tell that we can't call post-read-conversion and
 504    pre-write-conversion functions.  Usually the value is zero, but it
 505    is set to 1 temporarily while such functions are running.  This is
 506    to avoid infinite recursive call.  */
 507 static int inhibit_pre_post_conversion;
 508
 509 Lisp_Object Qchar_coding_system;
 510
 511 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 512    its validity.  */
 513
 514 Lisp_Object
 515 coding_safe_chars (coding_system)
 516      Lisp_Object coding_system;
 517 {
 518   Lisp_Object coding_spec, plist, safe_chars;
 519
 520   coding_spec = Fget (coding_system, Qcoding_system);
 521   plist = XVECTOR (coding_spec)->contents[3];
 522   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 523   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 524 }
 525
 526 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 527   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 528
 529 \f
 530 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 531
 532 /* Emacs' internal format for representation of multiple character
 533    sets is a kind of multi-byte encoding, i.e. characters are
 534    represented by variable-length sequences of one-byte codes.
 535
 536    ASCII characters and control characters (e.g. `tab', `newline') are
 537    represented by one-byte sequences which are their ASCII codes, in
 538    the range 0x00 through 0x7F.
 539
 540    8-bit characters of the range 0x80..0x9F are represented by
 541    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 542    code + 0x20).
 543
 544    8-bit characters of the range 0xA0..0xFF are represented by
 545    one-byte sequences which are their 8-bit code.
 546
 547    The other characters are represented by a sequence of `base
 548    leading-code', optional `extended leading-code', and one or two
 549    `position-code's.  The length of the sequence is determined by the
 550    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 551    whereas extended leading-code and position-code take the range 0xA0
 552    through 0xFF.  See `charset.h' for more details about leading-code
 553    and position-code.
 554
 555    --- CODE RANGE of Emacs' internal format ---
 556    character set        range
 557    -------------        -----
 558    ascii                0x00..0x7F
 559    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  560    eight-bit-graphic    0xA0..0xFF
 561    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 562    ---------------------------------------------
 563
 564    As this is the internal character representation, the format is
  565    usually not used externally (i.e. in a file or in data sent to a
  566    process).  But, it is possible to have a text externally in this
  567    format (e.g. by encoding with the coding system `emacs-mule').
 568
 569    In that case, a sequence of one-byte codes has a slightly different
 570    form.
 571
 572    Firstly, all characters in eight-bit-control are represented by
 573    one-byte sequences which are their 8-bit code.
 574
 575    Next, character composition data are represented by the byte
 576    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 577    where,
 578         METHOD is 0xF0 plus one of composition method (enum
 579         composition_method),
 580
 581         BYTES is 0xA0 plus the byte length of these composition data,
 582
 583         CHARS is 0xA0 plus the number of characters composed by these
 584         data,
 585
  586         COMPONENTs are characters of multibyte form or composition
  587         rules encoded as two bytes of ASCII codes.
 588
 589    In addition, for backward compatibility, the following formats are
 590    also recognized as composition data on decoding.
 591
 592    0x80 MSEQ ...
 593    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 594
 595    Here,
  596         MSEQ is a multibyte form but in this special format:
 597           ASCII: 0xA0 ASCII_CODE+0x80,
 598           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 599         RULE is a one byte code of the range 0xA0..0xF0 that
 600         represents a composition rule.
 601   */
 602
/* Table of 256 `enum emacs_code_class_type' entries.
   NOTE(review): presumably indexed by byte value to classify each
   byte of the emacs-mule format; nothing in the code visible here
   fills it in or reads it, so confirm against its users.  */
  603 enum emacs_code_class_type emacs_code_class[256];
 604
 605 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 606    Check if a text is encoded in Emacs' internal format.  If it is,
 607    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 608
 609 static int
 610 detect_coding_emacs_mule (src, src_end, multibytep)
 611       unsigned char *src, *src_end;
 612       int multibytep;
 613 {
 614   unsigned char c;
 615   int composing = 0;
 616   /* Dummy for ONE_MORE_BYTE.  */
 617   struct coding_system dummy_coding;
 618   struct coding_system *coding = &dummy_coding;
 619
 620   while (1)
 621     {
 622       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 623
 624       if (composing)
 625         {
 626           if (c < 0xA0)
 627             composing = 0;
 628           else if (c == 0xA0)
 629             {
 630               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 631               c &= 0x7F;
 632             }
 633           else
 634             c -= 0x20;
 635         }
 636
 637       if (c < 0x20)
 638         {
 639           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 640             return 0;
 641         }
 642       else if (c >= 0x80 && c < 0xA0)
 643         {
 644           if (c == 0x80)
 645             /* Old leading code for a composite character.  */
 646             composing = 1;
 647           else
 648             {
 649               unsigned char *src_base = src - 1;
 650               int bytes;
 651
 652               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 653                                                bytes))
 654                 return 0;
 655               src = src_base + bytes;
 656             }
 657         }
 658     }
 659  label_end_of_loop:
 660   return CODING_CATEGORY_MASK_EMACS_MULE;
 661 }
 662
 663
 664 /* Record the starting position START and METHOD of one composition.  */
 665
 666 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 667   do {                                                          \
 668     struct composition_data *cmp_data = coding->cmp_data;       \
 669     int *data = cmp_data->data + cmp_data->used;                \
 670     coding->cmp_data_start = cmp_data->used;                    \
 671     data[0] = -1;                                               \
 672     data[1] = cmp_data->char_offset + start;                    \
 673     data[3] = (int) method;                                     \
 674     cmp_data->used += 4;                                        \
 675   } while (0)
 676
 677 /* Record the ending position END of the current composition.  */
 678
 679 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 680   do {                                                          \
 681     struct composition_data *cmp_data = coding->cmp_data;       \
 682     int *data = cmp_data->data + coding->cmp_data_start;        \
 683     data[0] = cmp_data->used - coding->cmp_data_start;          \
 684     data[2] = cmp_data->char_offset + end;                      \
 685   } while (0)
 686
 687 /* Record one COMPONENT (alternate character or composition rule).  */
 688
 689 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 690   do {                                                                  \
 691     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 692     if (coding->cmp_data->used - coding->cmp_data_start                 \
 693         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 694       {                                                                 \
 695         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 696         coding->composing = COMPOSITION_NO;                             \
 697       }                                                                 \
 698   } while (0)
 699
 700
 701 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 702    is not less than SRC_END, return -1 without incrementing Src.  */
 703
 704 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 705
 706
 707 /* Decode a character represented as a component of composition
 708    sequence of Emacs 20 style at SRC.  Set C to that character, store
 709    its multibyte form sequence at P, and set P to the end of that
 710    sequence.  If no valid character is found, set C to -1.  */
 711
 712 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 713   do {                                                          \
 714     int bytes;                                                  \
 715                                                                 \
 716     c = SAFE_ONE_MORE_BYTE ();                                  \
 717     if (c < 0)                                                  \
 718       break;                                                    \
 719     if (CHAR_HEAD_P (c))                                        \
 720       c = -1;                                                   \
 721     else if (c == 0xA0)                                         \
 722       {                                                         \
 723         c = SAFE_ONE_MORE_BYTE ();                              \
 724         if (c < 0xA0)                                           \
 725           c = -1;                                               \
 726         else                                                    \
 727           {                                                     \
 728             c -= 0xA0;                                          \
 729             *p++ = c;                                           \
 730           }                                                     \
 731       }                                                         \
 732     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 733       {                                                         \
 734         unsigned char *p0 = p;                                  \
 735                                                                 \
 736         c -= 0x20;                                              \
 737         *p++ = c;                                               \
 738         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 739         while (--bytes)                                         \
 740           {                                                     \
 741             c = SAFE_ONE_MORE_BYTE ();                          \
 742             if (c < 0)                                          \
 743               break;                                            \
 744             *p++ = c;                                           \
 745           }                                                     \
 746         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))     \
 747           c = STRING_CHAR (p0, bytes);                          \
 748         else                                                    \
 749           c = -1;                                               \
 750       }                                                         \
 751     else                                                        \
 752       c = -1;                                                   \
 753   } while (0)
 754
 755
 756 /* Decode a composition rule represented as a component of composition
 757    sequence of Emacs 20 style at SRC.  Set C to the rule.  If not
 758    valid rule is found, set C to -1.  */
 759
 760 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 761   do {                                                  \
 762     c = SAFE_ONE_MORE_BYTE ();                          \
 763     c -= 0xA0;                                          \
 764     if (c < 0 || c >= 81)                               \
 765       c = -1;                                           \
 766     else                                                \
 767       {                                                 \
 768         gref = c / 9, nref = c % 9;                     \
 769         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 770       }                                                 \
 771   } while (0)
 772
 773
 774 /* Decode composition sequence encoded by `emacs-mule' at the source
 775    pointed by SRC.  SRC_END is the end of source.  Store information
 776    of the composition in CODING->cmp_data.
 777
 778    For backward compatibility, decode also a composition sequence of
 779    Emacs 20 style.  In that case, the composition sequence contains
 780    characters that should be extracted into a buffer or string.  Store
 781    those characters at *DESTINATION in multibyte form.
 782
 783    If we encounter an invalid byte sequence, return 0.
 784    If we encounter an insufficient source or destination, or
 785    insufficient space in CODING->cmp_data, return 1.
 786    Otherwise, return consumed bytes in the source.
 787
 788 */
 789 static INLINE int
 790 decode_composition_emacs_mule (coding, src, src_end,
 791                                destination, dst_end, dst_bytes)
 792      struct coding_system *coding;
 793      unsigned char *src, *src_end, **destination, *dst_end;
 794      int dst_bytes;
 795 {
 796   unsigned char *dst = *destination;
 797   int method, data_len, nchars;
 798   unsigned char *src_base = src++;
 799   /* Store components of composition.  */
 800   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 801   int ncomponent;
 802   /* Store multibyte form of characters to be composed.  This is for
 803      Emacs 20 style composition sequence.  */
 804   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 805   unsigned char *bufp = buf;
 806   int c, i, gref, nref;
 807
 808   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 809       >= COMPOSITION_DATA_SIZE)
 810     {
 811       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 812       return -1;
 813     }
 814
 815   ONE_MORE_BYTE (c);
 816   if (c - 0xF0 >= COMPOSITION_RELATIVE
 817            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 818     {
 819       int with_rule;
 820
 821       method = c - 0xF0;
 822       with_rule = (method == COMPOSITION_WITH_RULE
 823                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 824       ONE_MORE_BYTE (c);
 825       data_len = c - 0xA0;
 826       if (data_len < 4
 827           || src_base + data_len > src_end)
 828         return 0;
 829       ONE_MORE_BYTE (c);
 830       nchars = c - 0xA0;
 831       if (c < 1)
 832         return 0;
 833       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 834         {
 835           /* If it is longer than this, it can't be valid.  */
 836           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 837             return 0;
 838
 839           if (ncomponent % 2 && with_rule)
 840             {
 841               ONE_MORE_BYTE (gref);
 842               gref -= 32;
 843               ONE_MORE_BYTE (nref);
 844               nref -= 32;
 845               c = COMPOSITION_ENCODE_RULE (gref, nref);
 846             }
 847           else
 848             {
 849               int bytes;
 850               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 851                 c = STRING_CHAR (src, bytes);
 852               else
 853                 c = *src, bytes = 1;
 854               src += bytes;
 855             }
 856           component[ncomponent] = c;
 857         }
 858     }
 859   else
 860     {
 861       /* This may be an old Emacs 20 style format.  See the comment at
 862          the section 2 of this file.  */
 863       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 864       if (src == src_end
 865           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 866         goto label_end_of_loop;
 867
 868       src_end = src;
 869       src = src_base + 1;
 870       if (c < 0xC0)
 871         {
 872           method = COMPOSITION_RELATIVE;
 873           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 874             {
 875               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 876               if (c < 0)
 877                 break;
 878               component[ncomponent++] = c;
 879             }
 880           if (ncomponent < 2)
 881             return 0;
 882           nchars = ncomponent;
 883         }
 884       else if (c == 0xFF)
 885         {
 886           method = COMPOSITION_WITH_RULE;
 887           src++;
 888           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 889           if (c < 0)
 890             return 0;
 891           component[0] = c;
 892           for (ncomponent = 1;
 893                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 894             {
 895               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 896               if (c < 0)
 897                 break;
 898               component[ncomponent++] = c;
 899               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 900               if (c < 0)
 901                 break;
 902               component[ncomponent++] = c;
 903             }
 904           if (ncomponent < 3)
 905             return 0;
 906           nchars = (ncomponent + 1) / 2;
 907         }
 908       else
 909         return 0;
 910     }
 911
 912   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 913     {
 914       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 915       for (i = 0; i < ncomponent; i++)
 916         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 917       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 918       if (buf < bufp)
 919         {
 920           unsigned char *p = buf;
 921           EMIT_BYTES (p, bufp);
 922           *destination += bufp - buf;
 923           coding->produced_char += nchars;
 924         }
 925       return (src - src_base);
 926     }
 927  label_end_of_loop:
 928   return -1;
 929 }
 930
 931 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 932
 933 static void
 934 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 935      struct coding_system *coding;
 936      unsigned char *source, *destination;
 937      int src_bytes, dst_bytes;
 938 {
 939   unsigned char *src = source;
 940   unsigned char *src_end = source + src_bytes;
 941   unsigned char *dst = destination;
 942   unsigned char *dst_end = destination + dst_bytes;
 943   /* SRC_BASE remembers the start position in source in each loop.
 944      The loop will be exited when there's not enough source code, or
 945      when there's not enough destination area to produce a
 946      character.  */
 947   unsigned char *src_base;
 948
 949   coding->produced_char = 0;
 950   while ((src_base = src) < src_end)
 951     {
 952       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 953       int bytes;
 954
 955       if (*src == '\r')
 956         {
 957           int c = *src++;
 958
 959           if (coding->eol_type == CODING_EOL_CR)
 960             c = '\n';
 961           else if (coding->eol_type == CODING_EOL_CRLF)
 962             {
 963               ONE_MORE_BYTE (c);
 964               if (c != '\n')
 965                 {
 966                   src--;
 967                   c = '\r';
 968                 }
 969             }
 970           *dst++ = c;
 971           coding->produced_char++;
 972           continue;
 973         }
 974       else if (*src == '\n')
 975         {
 976           if ((coding->eol_type == CODING_EOL_CR
 977                || coding->eol_type == CODING_EOL_CRLF)
 978               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 979             {
 980               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 981               goto label_end_of_loop;
 982             }
 983           *dst++ = *src++;
 984           coding->produced_char++;
 985           continue;
 986         }
 987       else if (*src == 0x80 && coding->cmp_data)
 988         {
 989           /* Start of composition data.  */
 990           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
 991                                                          &dst, dst_end,
 992                                                          dst_bytes);
 993           if (consumed < 0)
 994             goto label_end_of_loop;
 995           else if (consumed > 0)
 996             {
 997               src += consumed;
 998               continue;
 999             }
1000           bytes = CHAR_STRING (*src, tmp);
1001           p = tmp;
1002           src++;
1003         }
1004       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
1005         {
1006           p = src;
1007           src += bytes;
1008         }
1009       else
1010         {
1011           bytes = CHAR_STRING (*src, tmp);
1012           p = tmp;
1013           src++;
1014         }
1015       if (dst + bytes >= (dst_bytes ? dst_end : src))
1016         {
1017           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1018           break;
1019         }
1020       while (bytes--) *dst++ = *p++;
1021       coding->produced_char++;
1022     }
1023  label_end_of_loop:
1024   coding->consumed = coding->consumed_char = src_base - source;
1025   coding->produced = dst - destination;
1026 }
1027
1028
/* Encode the composition data stored at DATA into the special byte
   sequence that starts with 0x80 and append it at DST:

        0x80 (0xF0 + METHOD) (0xA0 + COMPONENTS-BYTES)
             (0xA0 + COMPOSED-CHARS) COMPONENT ...

   DATA[0] is the length of this entry, DATA[2] - DATA[1] the number
   of composed characters, DATA[3] the method and DATA[4] onward the
   components.  For the rule-based methods the components alternate
   between characters and rules, each rule being written as two bytes
   offset by 0x20.

   Afterwards advance CODING->cmp_data_start past the entry and, if
   the current CODING->cmp_data is used up and has a successor, switch
   to the successor for the next call.  If the destination is too
   small, set CODING->result to CODING_FINISH_INSUFFICIENT_DST and
   jump to label_end_of_loop.  DST, DST_END, DST_BYTES and SRC are
   taken from the expanding scope.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char buf[1024];                                            \
    unsigned char *p = buf + 4; /* Components follow the header.  */    \
    unsigned char *p0;                                                  \
    int len = (data)[0];                                                \
    int i;                                                              \
                                                                        \
    if ((data)[3] != COMPOSITION_WITH_RULE                              \
        && (data)[3] != COMPOSITION_WITH_RULE_ALTCHARS)                 \
      {                                                                 \
        for (i = 4; i < len; i++)                                       \
          p += CHAR_STRING ((data)[i], p);                              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        p += CHAR_STRING ((data)[4], p);                                \
        for (i = 5; i < len; i += 2)                                    \
          {                                                             \
            int gref, nref;                                             \
            COMPOSITION_DECODE_RULE ((data)[i], gref, nref);            \
            *p++ = 0x20 + gref;                                         \
            *p++ = 0x20 + nref;                                         \
            p += CHAR_STRING ((data)[i + 1], p);                        \
          }                                                             \
      }                                                                 \
                                                                        \
    buf[0] = 0x80;                                                      \
    buf[1] = 0xF0 + (data)[3];                  /* METHOD */            \
    buf[2] = 0xA0 + (p - buf);                  /* COMPONENTS-BYTES */  \
    buf[3] = 0xA0 + ((data)[2] - (data)[1]);    /* COMPOSED-CHARS */    \
                                                                        \
    if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
      {                                                                 \
        (coding)->result = CODING_FINISH_INSUFFICIENT_DST;              \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (p0 = buf; p0 < p; p0++)                                        \
      *dst++ = *p0;                                                     \
                                                                        \
    (coding)->cmp_data_start += len;                                    \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used            \
        && (coding)->cmp_data->next)                                    \
      {                                                                 \
        (coding)->cmp_data = (coding)->cmp_data->next;                  \
        (coding)->cmp_data_start = 0;                                   \
      }                                                                 \
  } while (0)
1078
1079
1080 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1081                             unsigned char *, int, int));
1082
1083 static void
1084 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1085      struct coding_system *coding;
1086      unsigned char *source, *destination;
1087      int src_bytes, dst_bytes;
1088 {
1089   unsigned char *src = source;
1090   unsigned char *src_end = source + src_bytes;
1091   unsigned char *dst = destination;
1092   unsigned char *dst_end = destination + dst_bytes;
1093   unsigned char *src_base;
1094   int c;
1095   int char_offset;
1096   int *data;
1097
1098   Lisp_Object translation_table;
1099
1100   translation_table = Qnil;
1101
1102   /* Optimization for the case that there's no composition.  */
1103   if (!coding->cmp_data || coding->cmp_data->used == 0)
1104     {
1105       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1106       return;
1107     }
1108
1109   char_offset = coding->cmp_data->char_offset;
1110   data = coding->cmp_data->data + coding->cmp_data_start;
1111   while (1)
1112     {
1113       src_base = src;
1114
1115       /* If SRC starts a composition, encode the information about the
1116          composition in advance.  */
1117       if (coding->cmp_data_start < coding->cmp_data->used
1118           && char_offset + coding->consumed_char == data[1])
1119         {
1120           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1121           char_offset = coding->cmp_data->char_offset;
1122           data = coding->cmp_data->data + coding->cmp_data_start;
1123         }
1124
1125       ONE_MORE_CHAR (c);
1126       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1127                         || coding->eol_type == CODING_EOL_CR))
1128         {
1129           if (coding->eol_type == CODING_EOL_CRLF)
1130             EMIT_TWO_BYTES ('\r', c);
1131           else
1132             EMIT_ONE_BYTE ('\r');
1133         }
1134       else if (SINGLE_BYTE_CHAR_P (c))
1135         EMIT_ONE_BYTE (c);
1136       else
1137         EMIT_BYTES (src_base, src);
1138       coding->consumed_char++;
1139     }
1140  label_end_of_loop:
1141   coding->consumed = src_base - source;
1142   coding->produced = coding->produced_char = dst - destination;
1143   return;
1144 }
1145
1146 \f
1147 /*** 3. ISO2022 handlers ***/
1148
1149 /* The following note describes the coding system ISO2022 briefly.
1150    Since the intention of this note is to help understand the
1151    functions in this file, some parts are NOT ACCURATE or are OVERLY
1152    SIMPLIFIED.  For thorough understanding, please refer to the
1153    original document of ISO2022.  This is equivalent to the standard
1154    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1155
1156    ISO2022 provides many mechanisms to encode several character sets
1157    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1158    is encoded using bytes less than 128.  This may make the encoded
1159    text a little bit longer, but the text passes more easily through
1160    several types of gateway, some of which strip off the MSB (Most
1161    Significant Bit).
1162
1163    There are two kinds of character sets: control character sets and
1164    graphic character sets.  The former contain control characters such
1165    as `newline' and `escape' to provide control functions (control
1166    functions are also provided by escape sequences).  The latter
1167    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1168    two control character sets and many graphic character sets.
1169
1170    Graphic character sets are classified into one of the following
1171    four classes, according to the number of bytes (DIMENSION) and
1172    number of characters in one dimension (CHARS) of the set:
1173    - DIMENSION1_CHARS94
1174    - DIMENSION1_CHARS96
1175    - DIMENSION2_CHARS94
1176    - DIMENSION2_CHARS96
1177
1178    In addition, each character set is assigned an identification tag,
1179    unique for each set, called the "final character" (denoted as <F>
1180    hereafter).  The <F> of each character set is decided by ECMA(*)
1181    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1182    (0x30..0x3F are for private use only).
1183
1184    Note (*): ECMA = European Computer Manufacturers Association
1185
1186    Here are examples of graphic character sets [NAME(<F>)]:
1187         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1188         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1189         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1190         o DIMENSION2_CHARS96 -- none for the moment
1191
1192    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1193         C0 [0x00..0x1F] -- control character plane 0
1194         GL [0x20..0x7F] -- graphic character plane 0
1195         C1 [0x80..0x9F] -- control character plane 1
1196         GR [0xA0..0xFF] -- graphic character plane 1
1197
1198    A control character set is directly designated and invoked to C0 or
1199    C1 by an escape sequence.  The most common case is that:
1200    - ISO646's  control character set is designated/invoked to C0, and
1201    - ISO6429's control character set is designated/invoked to C1,
1202    and usually these designations/invocations are omitted in encoded
1203    text.  In a 7-bit environment, only C0 can be used, and a control
1204    character for C1 is encoded by an appropriate escape sequence to
1205    fit into the environment.  All control characters for C1 are
1206    defined to have corresponding escape sequences.
1207
1208    A graphic character set is at first designated to one of four
1209    graphic registers (G0 through G3), then these graphic registers are
1210    invoked to GL or GR.  These designations and invocations can be
1211    done independently.  The most common case is that G0 is invoked to
1212    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1213    these invocations and designations are omitted in encoded text.
1214    In a 7-bit environment, only GL can be used.
1215
1216    When a graphic character set of CHARS94 is invoked to GL, codes
1217    0x20 and 0x7F of the GL area work as control characters SPACE and
1218    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1219    be used.
1220
1221    There are two ways of invocation: locking-shift and single-shift.
1222    With locking-shift, the invocation lasts until the next different
1223    invocation, whereas with single-shift, the invocation affects the
1224    following character only and doesn't affect the locking-shift
1225    state.  Invocations are done by the following control characters or
1226    escape sequences:
1227
1228    ----------------------------------------------------------------------
1229    abbrev  function                  cntrl escape seq   description
1230    ----------------------------------------------------------------------
1231    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1232    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1233    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1234    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1235    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1236    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1237    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1238    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1239    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1240    ----------------------------------------------------------------------
1241    (*) These are not used by any known coding system.
1242
1243    Control characters for these functions are defined by macros
1244    ISO_CODE_XXX in `coding.h'.
1245
1246    Designations are done by the following escape sequences:
1247    ----------------------------------------------------------------------
1248    escape sequence      description
1249    ----------------------------------------------------------------------
1250    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1251    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1252    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1253    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1254    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1255    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1256    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1257    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1258    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1259    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1260    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1261    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1262    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1263    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1264    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1265    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1266    ----------------------------------------------------------------------
1267
1268    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1269    of dimension 1, chars 94, and final character <F>, etc...
1270
1271    Note (*): Although these designations are not allowed in ISO2022,
1272    Emacs accepts them on decoding, and produces them on encoding
1273    CHARS96 character sets in a coding system which is characterized as
1274    7-bit environment, non-locking-shift, and non-single-shift.
1275
1276    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1277    '(' can be omitted.  We refer to this as "short-form" hereafter.
1278
1279    Now you may notice that there are a lot of ways of encoding the
1280    same multilingual text in ISO2022.  Actually, there exist many
   coding systems such as Compound Text (used in X11's inter client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1283    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1284    localized platforms), and all of these are variants of ISO2022.
1285
1286    In addition to the above, Emacs handles two more kinds of escape
1287    sequences: ISO6429's direction specification and Emacs' private
1288    sequence for specifying character composition.
1289
1290    ISO6429's direction specification takes the following form:
1291         o CSI ']'      -- end of the current direction
1292         o CSI '0' ']'  -- end of the current direction
1293         o CSI '1' ']'  -- start of left-to-right text
1294         o CSI '2' ']'  -- start of right-to-left text
1295    The control character CSI (0x9B: control sequence introducer) is
1296    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1297
1298    Character composition specification takes the following form:
1299         o ESC '0' -- start relative composition
1300         o ESC '1' -- end composition
1301         o ESC '2' -- start rule-base composition (*)
1302         o ESC '3' -- start relative composition with alternate chars  (**)
1303         o ESC '4' -- start rule-base composition with alternate chars  (**)
1304   Since these are not standard escape sequences of any ISO standard,
1305   the use of them with these meanings is restricted to Emacs only.
1306
1307   (*) This form is used only in Emacs 20.5 and older versions,
1308   but the newer versions can safely decode it.
1309   (**) This form is used only in Emacs 21.1 and newer versions,
1310   and the older versions can't decode it.
1311
1312   Here's a list of example usages of these composition escape
1313   sequences (categorized by `enum composition_method').
1314
1315   COMPOSITION_RELATIVE:
1316         ESC 0 CHAR [ CHAR ] ESC 1
1317   COMPOSITION_WITH_RULE:
1318         ESC 2 CHAR [ RULE CHAR ] ESC 1
1319   COMPOSITION_WITH_ALTCHARS:
1320         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1321   COMPOSITION_WITH_RULE_ALTCHARS:
1322         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1323
/* Table of 256 `enum iso_code_class_type' values.  NOTE(review):
   presumably indexed by byte value to give the ISO2022 class of each
   code; confirm against the code that fills it in.  */
enum iso_code_class_type iso_code_class[256];
1325
/* The three predicates below test the coding system registered for
   category IDX in coding_system_table.  An entry of that table may be
   null (CHARSET_OK has always checked for that), in which case every
   predicate yields zero instead of dereferencing the null pointer.  */

/* Nonzero if category IDX has a coding system for which character C
   of CHARSET is acceptable: CHARSET is ASCII or C is among the safe
   characters of the coding system, and a designation is requested for
   CHARSET.  Assigns the variable `safe_chars' of the expanding scope
   as a side effect.  */
#define CHARSET_OK(idx, charset, c)                                     \
  (coding_system_table[idx]                                             \
   && (charset == CHARSET_ASCII                                         \
       || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
           CODING_SAFE_CHAR_P (safe_chars, c)))                         \
   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                              charset)                  \
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))

/* Nonzero if category IDX has a coding system whose initial
   designation number 1 is non-negative.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx]                                             \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))

/* Nonzero if category IDX has a coding system for which composition
   is not disabled.  */
#define COMPOSITION_OK(idx)                                             \
  (coding_system_table[idx]                                             \
   && coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1340
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in ISO2022.  If it is, return an
   integer in which the appropriate flag bits are set, any of:
        CODING_CATEGORY_MASK_ISO_7
        CODING_CATEGORY_MASK_ISO_7_TIGHT
        CODING_CATEGORY_MASK_ISO_8_1
        CODING_CATEGORY_MASK_ISO_8_2
        CODING_CATEGORY_MASK_ISO_7_ELSE
        CODING_CATEGORY_MASK_ISO_8_ELSE
   If a code which should never appear in ISO2022 is found,
   return 0.  */
1352
1353 static int
1354 detect_coding_iso2022 (src, src_end, multibytep)
1355      unsigned char *src, *src_end;
1356      int multibytep;
1357 {
1358   int mask = CODING_CATEGORY_MASK_ISO;
1359   int mask_found = 0;
1360   int reg[4], shift_out = 0, single_shifting = 0;
1361   int c, c1, charset;
1362   /* Dummy for ONE_MORE_BYTE.  */
1363   struct coding_system dummy_coding;
1364   struct coding_system *coding = &dummy_coding;
1365   Lisp_Object safe_chars;
1366
1367   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1368   while (mask && src < src_end)
1369     {
1370       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1371     retry:
1372       switch (c)
1373         {
1374         case ISO_CODE_ESC:
1375           if (inhibit_iso_escape_detection)
1376             break;
1377           single_shifting = 0;
1378           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1379           if (c >= '(' && c <= '/')
1380             {
1381               /* Designation sequence for a charset of dimension 1.  */
1382               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1383               if (c1 < ' ' || c1 >= 0x80
1384                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1385                 /* Invalid designation sequence.  Just ignore.  */
1386                 break;
1387               reg[(c - '(') % 4] = charset;
1388             }
1389           else if (c == '$')
1390             {
1391               /* Designation sequence for a charset of dimension 2.  */
1392               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1393               if (c >= '@' && c <= 'B')
1394                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1395                 reg[0] = charset = iso_charset_table[1][0][c];
1396               else if (c >= '(' && c <= '/')
1397                 {
1398                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1399                   if (c1 < ' ' || c1 >= 0x80
1400                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1401                     /* Invalid designation sequence.  Just ignore.  */
1402                     break;
1403                   reg[(c - '(') % 4] = charset;
1404                 }
1405               else
1406                 /* Invalid designation sequence.  Just ignore.  */
1407                 break;
1408             }
1409           else if (c == 'N' || c == 'O')
1410             {
1411               /* ESC <Fe> for SS2 or SS3.  */
1412               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1413               break;
1414             }
1415           else if (c >= '0' && c <= '4')
1416             {
1417               /* ESC <Fp> for start/end composition.  */
1418               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1419                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1420               else
1421                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1422               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1423                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1424               else
1425                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1426               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1427                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1428               else
1429                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1430               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1431                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1432               else
1433                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1434               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1435                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1436               else
1437                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1438               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1439                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1440               else
1441                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1442               break;
1443             }
1444           else
1445             /* Invalid escape sequence.  Just ignore.  */
1446             break;
1447
1448           /* We found a valid designation sequence for CHARSET.  */
1449           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1450           c = MAKE_CHAR (charset, 0, 0);
1451           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1452             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1453           else
1454             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1455           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1456             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1457           else
1458             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1459           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1460             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1461           else
1462             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1463           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1464             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1465           else
1466             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1467           break;
1468
1469         case ISO_CODE_SO:
1470           if (inhibit_iso_escape_detection)
1471             break;
1472           single_shifting = 0;
1473           if (shift_out == 0
1474               && (reg[1] >= 0
1475                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1476                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1477             {
1478               /* Locking shift out.  */
1479               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1480               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1481             }
1482           break;
1483
1484         case ISO_CODE_SI:
1485           if (inhibit_iso_escape_detection)
1486             break;
1487           single_shifting = 0;
1488           if (shift_out == 1)
1489             {
1490               /* Locking shift in.  */
1491               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1492               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1493             }
1494           break;
1495
1496         case ISO_CODE_CSI:
1497           single_shifting = 0;
1498         case ISO_CODE_SS2:
1499         case ISO_CODE_SS3:
1500           {
1501             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1502
1503             if (inhibit_iso_escape_detection)
1504               break;
1505             if (c != ISO_CODE_CSI)
1506               {
1507                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1508                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1509                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1510                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1511                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1512                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1513                 single_shifting = 1;
1514               }
1515             if (VECTORP (Vlatin_extra_code_table)
1516                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1517               {
1518                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1519                     & CODING_FLAG_ISO_LATIN_EXTRA)
1520                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1521                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1522                     & CODING_FLAG_ISO_LATIN_EXTRA)
1523                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1524               }
1525             mask &= newmask;
1526             mask_found |= newmask;
1527           }
1528           break;
1529
1530         default:
1531           if (c < 0x80)
1532             {
1533               single_shifting = 0;
1534               break;
1535             }
1536           else if (c < 0xA0)
1537             {
1538               single_shifting = 0;
1539               if (VECTORP (Vlatin_extra_code_table)
1540                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1541                 {
1542                   int newmask = 0;
1543
1544                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1545                       & CODING_FLAG_ISO_LATIN_EXTRA)
1546                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1547                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1548                       & CODING_FLAG_ISO_LATIN_EXTRA)
1549                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1550                   mask &= newmask;
1551                   mask_found |= newmask;
1552                 }
1553               else
1554                 return 0;
1555             }
1556           else
1557             {
1558               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1559                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1560               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1561               /* Check the length of succeeding codes of the range
1562                  0xA0..0FF.  If the byte length is odd, we exclude
1563                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1564                  when we are not single shifting.  */
1565               if (!single_shifting
1566                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1567                 {
1568                   int i = 1;
1569
1570                   c = -1;
1571                   while (src < src_end)
1572                     {
1573                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1574                       if (c < 0xA0)
1575                         break;
1576                       i++;
1577                     }
1578
1579                   if (i & 1 && src < src_end)
1580                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1581                   else
1582                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1583                   if (c >= 0)
1584                     /* This means that we have read one extra byte.  */
1585                     goto retry;
1586                 }
1587             }
1588           break;
1589         }
1590     }
1591  label_end_of_loop:
1592   return (mask & mask_found);
1593 }
1594
1595 /* Yield the character code for the character of charset CHARSET
1596    whose 1st and 2nd position codes are C1 and C2.  When the variable
1597    `translation_table' (taken from the scope where the macro is
1598    expanded) is non-nil, yield the result of translate_char instead.  */
1599
1600 #define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
1601   (! NILP (translation_table)                                   \
1602    ? translate_char (translation_table, -1, charset, c1, c2)    \
1603    : MAKE_CHAR (charset, c1, c2))
1604
1605 /* Set designation state into CODING.  The charset is looked up in
        ISO_CHARSET_TABLE by DIMENSION, CHARS, and FINAL_CHAR.  It is
        stored as the designation of register REG only when it is >= 0
        and either CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
        equals REG or the character MAKE_CHAR (charset, 0, 0) passes
        CODING_SAFE_CHAR_P for `safe_chars'.  If CODING_MODE_DIRECTION is
        set in coding->mode and the charset has a reverse charset, the
        reverse charset is stored instead.
        In every other case control jumps to `label_invalid_code': when
        FINAL_CHAR is below '0' or not below 128, when the charset is
        rejected (REG is then remembered in
        last_invalid_designation_register), and when ASCII is designated
        to register 0 while last_invalid_designation_register is 0.
        The expansion uses `coding', `safe_chars', and the label
        `label_invalid_code' from the enclosing function, and it
        evaluates REG and FINAL_CHAR more than once.  */
1606 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1607   do {                                                                     \
1608     int charset, c;                                                        \
1609                                                                            \
1610     if (final_char < '0' || final_char >= 128)                             \
1611       goto label_invalid_code;                                             \
1612     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1613                                  make_number (chars),                      \
1614                                  make_number (final_char));                \
1615     c = MAKE_CHAR (charset, 0, 0);                                         \
1616     if (charset >= 0                                                       \
1617         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1618             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1619       {                                                                    \
1620         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1621             && reg == 0                                                    \
1622             && charset == CHARSET_ASCII)                                   \
1623           {                                                                \
1624             /* We should insert this designation sequence as is so         \
1625                that it is surely written back to a file.  */               \
1626             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1627             goto label_invalid_code;                                       \
1628           }                                                                \
1629         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1630         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1631             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1632           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1633         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1634       }                                                                    \
1635     else                                                                   \
1636       {                                                                    \
1637         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1638         goto label_invalid_code;                                           \
1639       }                                                                    \
1640   } while (0)
1641
1642 /* Allocate one more memory block for composition information and
1643    append it to CODING's chain of blocks.  The new block has `used'
        set to 0, remembers CHAR_OFFSET, and becomes the current block
        (coding->cmp_data); coding->cmp_data_start is reset to 0.  */
1644
1645 void
1646 coding_allocate_composition_data (coding, char_offset)
1647      struct coding_system *coding;
1648      int char_offset;
1649 {
1650   struct composition_data *block, *tail;
1651
1652   block = (struct composition_data *) xmalloc (sizeof *block);
1653   block->next = NULL;
1654   block->prev = tail = coding->cmp_data;
1655   block->used = 0;
1656   block->char_offset = char_offset;
1657   if (tail != NULL)
1658     tail->next = block;
1659   coding->cmp_data_start = 0;
1660   coding->cmp_data = block;
1661 }
1662
1663 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1664    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1665    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1666    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1667    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
        C1 is the byte that followed ESC.  Three cases are distinguished:
        - coding->composing is COMPOSITION_DISABLED: ESC and C1 & 0x7f are
          copied to DST and counted as two produced characters.
        - No composition is in progress: the method is chosen from C1
          ('0', '2', '3', anything else) and registered with
          CODING_ADD_COMPOSITION_START.  If coding->cmp_data is missing
          or nearly full, coding->result is set to
          CODING_FINISH_INSUFFICIENT_CMP and control jumps to
          `label_end_of_loop' instead.
        - A composition is already in progress: the WITH_ALTCHARS and
          WITH_RULE_ALTCHARS methods are switched to COMPOSITION_RELATIVE.
        The expansion uses `coding', `dst', and the label
        `label_end_of_loop' from the enclosing function.
        NOTE(review): the two bytes are stored through DST without any
        check of the remaining space here -- confirm callers ensure room.
1668   */
1669
1670 #define DECODE_COMPOSITION_START(c1)                                       \
1671   do {                                                                     \
1672     if (coding->composing == COMPOSITION_DISABLED)                         \
1673       {                                                                    \
1674         *dst++ = ISO_CODE_ESC;                                             \
1675         *dst++ = c1 & 0x7f;                                                \
1676         coding->produced_char += 2;                                        \
1677       }                                                                    \
1678     else if (!COMPOSING_P (coding))                                        \
1679       {                                                                    \
1680         /* This is surely the start of a composition.  We must be sure     \
1681            that coding->cmp_data has enough space to store the             \
1682            information about the composition.  If not, terminate the       \
1683            current decoding loop, allocate one more memory block for       \
1684            coding->cmp_data in the caller, then start the decoding         \
1685            loop again.  We can't allocate memory here directly because     \
1686            it may cause buffer/string relocation.  */                      \
1687         if (!coding->cmp_data                                              \
1688             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1689                 >= COMPOSITION_DATA_SIZE))                                 \
1690           {                                                                \
1691             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1692             goto label_end_of_loop;                                        \
1693           }                                                                \
1694         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1695                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1696                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1697                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1698         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1699                                       coding->composing);                  \
1700         coding->composition_rule_follows = 0;                              \
1701       }                                                                    \
1702     else                                                                   \
1703       {                                                                    \
1704         /* We are already handling a composition.  If the method is        \
1705            the following two, the codes following the current escape       \
1706            sequence are actual characters stored in a buffer.  */          \
1707         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1708             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1709           {                                                                \
1710             coding->composing = COMPOSITION_RELATIVE;                      \
1711             coding->composition_rule_follows = 0;                          \
1712           }                                                                \
1713       }                                                                    \
1714   } while (0)
1715
1716 /* Handle composition end sequence ESC 1.  While a composition is in
        progress, invoke CODING_ADD_COMPOSITION_END with the current
        coding->produced_char and set coding->composing to
        COMPOSITION_NO.  Otherwise copy ESC and C1 to DST as they are and
        count them as two produced characters.  The expansion uses
        `coding' and `dst' from the enclosing function.  */
1717
1718 #define DECODE_COMPOSITION_END(c1)                                      \
1719   do {                                                                  \
1720     if (COMPOSING_P (coding))                                           \
1721       {                                                                 \
1722         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1723         coding->composing = COMPOSITION_NO;                             \
1724       }                                                                 \
1725     else                                                                \
1726       {                                                                 \
1727         *dst++ = ISO_CODE_ESC;                                          \
1728         *dst++ = c1;                                                    \
1729         coding->produced_char += 2;                                     \
1730       }                                                                 \
1731   } while (0)
1732
1733 /* Decode a composition rule from the byte C1 (and maybe one more byte
1734    from SRC) and store one encoded composition rule in
1735    coding->cmp_data.
        C1 must be a modifiable lvalue: 32 is subtracted from it in place.
        After that, a value below 81 is the old one-byte format; it is
        split into GREF = C1 / 9 and NREF = C1 % 9, and a GREF or NREF of
        4 is replaced by 10.  A value from 81 up to 92 is the new two-byte
        format; the next byte is fetched into `c2' with ONE_MORE_BYTE and
        the rule is encoded from C1 - 81 and C2 - 32.  For any other
        value the rule passed to CODING_ADD_COMPOSITION_COMPONENT is 0.
        Finally coding->composition_rule_follows is cleared.  The
        expansion uses `coding' and `c2' from the enclosing function.  */
1736
1737 #define DECODE_COMPOSITION_RULE(c1)                                     \
1738   do {                                                                  \
1739     int rule = 0;                                                       \
1740     (c1) -= 32;                                                         \
1741     if (c1 < 81)                /* old format (before ver.21) */        \
1742       {                                                                 \
1743         int gref = (c1) / 9;                                            \
1744         int nref = (c1) % 9;                                            \
1745         if (gref == 4) gref = 10;                                       \
1746         if (nref == 4) nref = 10;                                       \
1747         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1748       }                                                                 \
1749     else if (c1 < 93)           /* new format (after ver.21) */         \
1750       {                                                                 \
1751         ONE_MORE_BYTE (c2);                                             \
1752         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1753       }                                                                 \
1754     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1755     coding->composition_rule_follows = 0;                               \
1756   } while (0)
1757
1758
1759 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1760
1761 static void
1762 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1763      struct coding_system *coding;
1764      unsigned char *source, *destination;
1765      int src_bytes, dst_bytes;
1766 {
1767   unsigned char *src = source;
1768   unsigned char *src_end = source + src_bytes;
1769   unsigned char *dst = destination;
1770   unsigned char *dst_end = destination + dst_bytes;
1771   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1772   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1773   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1774   /* SRC_BASE remembers the start position in source in each loop.
1775      The loop will be exited when there's not enough source code
1776      (within macro ONE_MORE_BYTE), or when there's not enough
1777      destination area to produce a character (within macro
1778      EMIT_CHAR).  */
1779   unsigned char *src_base;
1780   int c, charset;
1781   Lisp_Object translation_table;
1782   Lisp_Object safe_chars;
1783
1784   safe_chars = coding_safe_chars (coding->symbol);
1785
1786   if (NILP (Venable_character_translation))
1787     translation_table = Qnil;
1788   else
1789     {
1790       translation_table = coding->translation_table_for_decode;
1791       if (NILP (translation_table))
1792         translation_table = Vstandard_translation_table_for_decode;
1793     }
1794
1795   coding->result = CODING_FINISH_NORMAL;
1796
1797   while (1)
1798     {
1799       int c1, c2;
1800
1801       src_base = src;
1802       ONE_MORE_BYTE (c1);
1803
1804       /* We produce no character or one character.  */
1805       switch (iso_code_class [c1])
1806         {
1807         case ISO_0x20_or_0x7F:
1808           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1809             {
1810               DECODE_COMPOSITION_RULE (c1);
1811               continue;
1812             }
1813           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1814             {
1815               /* This is SPACE or DEL.  */
1816               charset = CHARSET_ASCII;
1817               break;
1818             }
1819           /* This is a graphic character, we fall down ...  */
1820
1821         case ISO_graphic_plane_0:
1822           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1823             {
1824               DECODE_COMPOSITION_RULE (c1);
1825               continue;
1826             }
1827           charset = charset0;
1828           break;
1829
1830         case ISO_0xA0_or_0xFF:
1831           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1832               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1833             goto label_invalid_code;
1834           /* This is a graphic character, we fall down ... */
1835
1836         case ISO_graphic_plane_1:
1837           if (charset1 < 0)
1838             goto label_invalid_code;
1839           charset = charset1;
1840           break;
1841
1842         case ISO_control_0:
1843           if (COMPOSING_P (coding))
1844             DECODE_COMPOSITION_END ('1');
1845
1846           /* All ISO2022 control characters in this class have the
1847              same representation in Emacs internal format.  */
1848           if (c1 == '\n'
1849               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1850               && (coding->eol_type == CODING_EOL_CR
1851                   || coding->eol_type == CODING_EOL_CRLF))
1852             {
1853               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1854               goto label_end_of_loop;
1855             }
1856           charset = CHARSET_ASCII;
1857           break;
1858
1859         case ISO_control_1:
1860           if (COMPOSING_P (coding))
1861             DECODE_COMPOSITION_END ('1');
1862           goto label_invalid_code;
1863
1864         case ISO_carriage_return:
1865           if (COMPOSING_P (coding))
1866             DECODE_COMPOSITION_END ('1');
1867
1868           if (coding->eol_type == CODING_EOL_CR)
1869             c1 = '\n';
1870           else if (coding->eol_type == CODING_EOL_CRLF)
1871             {
1872               ONE_MORE_BYTE (c1);
1873               if (c1 != ISO_CODE_LF)
1874                 {
1875                   src--;
1876                   c1 = '\r';
1877                 }
1878             }
1879           charset = CHARSET_ASCII;
1880           break;
1881
1882         case ISO_shift_out:
1883           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1884               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1885             goto label_invalid_code;
1886           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1887           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1888           continue;
1889
1890         case ISO_shift_in:
1891           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1892             goto label_invalid_code;
1893           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1894           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1895           continue;
1896
1897         case ISO_single_shift_2_7:
1898         case ISO_single_shift_2:
1899           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1900             goto label_invalid_code;
1901           /* SS2 is handled as an escape sequence of ESC 'N' */
1902           c1 = 'N';
1903           goto label_escape_sequence;
1904
1905         case ISO_single_shift_3:
1906           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1907             goto label_invalid_code;
1908           /* SS2 is handled as an escape sequence of ESC 'O' */
1909           c1 = 'O';
1910           goto label_escape_sequence;
1911
1912         case ISO_control_sequence_introducer:
1913           /* CSI is handled as an escape sequence of ESC '[' ...  */
1914           c1 = '[';
1915           goto label_escape_sequence;
1916
1917         case ISO_escape:
1918           ONE_MORE_BYTE (c1);
1919         label_escape_sequence:
1920           /* Escape sequences handled by Emacs are invocation,
1921              designation, direction specification, and character
1922              composition specification.  */
1923           switch (c1)
1924             {
1925             case '&':           /* revision of following character set */
1926               ONE_MORE_BYTE (c1);
1927               if (!(c1 >= '@' && c1 <= '~'))
1928                 goto label_invalid_code;
1929               ONE_MORE_BYTE (c1);
1930               if (c1 != ISO_CODE_ESC)
1931                 goto label_invalid_code;
1932               ONE_MORE_BYTE (c1);
1933               goto label_escape_sequence;
1934
1935             case '$':           /* designation of 2-byte character set */
1936               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1937                 goto label_invalid_code;
1938               ONE_MORE_BYTE (c1);
1939               if (c1 >= '@' && c1 <= 'B')
1940                 {       /* designation of JISX0208.1978, GB2312.1980,
1941                            or JISX0208.1980 */
1942                   DECODE_DESIGNATION (0, 2, 94, c1);
1943                 }
1944               else if (c1 >= 0x28 && c1 <= 0x2B)
1945                 {       /* designation of DIMENSION2_CHARS94 character set */
1946                   ONE_MORE_BYTE (c2);
1947                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1948                 }
1949               else if (c1 >= 0x2C && c1 <= 0x2F)
1950                 {       /* designation of DIMENSION2_CHARS96 character set */
1951                   ONE_MORE_BYTE (c2);
1952                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1953                 }
1954               else
1955                 goto label_invalid_code;
1956               /* We must update these variables now.  */
1957               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1958               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1959               continue;
1960
1961             case 'n':           /* invocation of locking-shift-2 */
1962               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1963                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1964                 goto label_invalid_code;
1965               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1966               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1967               continue;
1968
1969             case 'o':           /* invocation of locking-shift-3 */
1970               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1971                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1972                 goto label_invalid_code;
1973               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1974               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1975               continue;
1976
1977             case 'N':           /* invocation of single-shift-2 */
1978               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1979                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1980                 goto label_invalid_code;
1981               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1982               ONE_MORE_BYTE (c1);
1983               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1984                 goto label_invalid_code;
1985               break;
1986
1987             case 'O':           /* invocation of single-shift-3 */
1988               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1989                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1990                 goto label_invalid_code;
1991               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1992               ONE_MORE_BYTE (c1);
1993               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1994                 goto label_invalid_code;
1995               break;
1996
1997             case '0': case '2': case '3': case '4': /* start composition */
1998               DECODE_COMPOSITION_START (c1);
1999               continue;
2000
2001             case '1':           /* end composition */
2002               DECODE_COMPOSITION_END (c1);
2003               continue;
2004
2005             case '[':           /* specification of direction */
2006               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2007                 goto label_invalid_code;
2008               /* For the moment, nested direction is not supported.
2009                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2010                  left-to-right, and nonzero means right-to-left.  */
2011               ONE_MORE_BYTE (c1);
2012               switch (c1)
2013                 {
2014                 case ']':       /* end of the current direction */
2015                   coding->mode &= ~CODING_MODE_DIRECTION;
2016
2017                 case '0':       /* end of the current direction */
2018                 case '1':       /* start of left-to-right direction */
2019                   ONE_MORE_BYTE (c1);
2020                   if (c1 == ']')
2021                     coding->mode &= ~CODING_MODE_DIRECTION;
2022                   else
2023                     goto label_invalid_code;
2024                   break;
2025
2026                 case '2':       /* start of right-to-left direction */
2027                   ONE_MORE_BYTE (c1);
2028                   if (c1 == ']')
2029                     coding->mode |= CODING_MODE_DIRECTION;
2030                   else
2031                     goto label_invalid_code;
2032                   break;
2033
2034                 default:
2035                   goto label_invalid_code;
2036                 }
2037               continue;
2038
2039             case '%':
2040               if (COMPOSING_P (coding))
2041                 DECODE_COMPOSITION_END ('1');
2042               ONE_MORE_BYTE (c1);
2043               if (c1 == '/')
2044                 {
2045                   /* CTEXT extended segment:
2046                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2047                      We keep these bytes as is for the moment.
2048                      They may be decoded by post-read-conversion.  */
2049                   int dim, M, L;
2050                   int size, required;
2051                   int produced_chars;
2052
2053                   ONE_MORE_BYTE (dim);
2054                   ONE_MORE_BYTE (M);
2055                   ONE_MORE_BYTE (L);
2056                   size = ((M - 128) * 128) + (L - 128);
2057                   required = 8 + size * 2;
2058                   if (dst + required > (dst_bytes ? dst_end : src))
2059                     goto label_end_of_loop;
2060                   *dst++ = ISO_CODE_ESC;
2061                   *dst++ = '%';
2062                   *dst++ = '/';
2063                   *dst++ = dim;
2064                   produced_chars = 4;
2065                   dst += CHAR_STRING (M, dst), produced_chars++;
2066                   dst += CHAR_STRING (L, dst), produced_chars++;
2067                   while (size-- > 0)
2068                     {
2069                       ONE_MORE_BYTE (c1);
2070                       dst += CHAR_STRING (c1, dst), produced_chars++;
2071                     }
2072                   coding->produced_char += produced_chars;
2073                 }
2074               else if (c1 == 'G')
2075                 {
2076                   unsigned char *d = dst;
2077                   int produced_chars;
2078
2079                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2080                      ESC % G --UTF-8-BYTES-- ESC % @
2081                      We keep these bytes as is for the moment.
2082                      They may be decoded by post-read-conversion.  */
2083                   if (d + 6 > (dst_bytes ? dst_end : src))
2084                     goto label_end_of_loop;
2085                   *d++ = ISO_CODE_ESC;
2086                   *d++ = '%';
2087                   *d++ = 'G';
2088                   produced_chars = 3;
2089                   while (d + 1 < (dst_bytes ? dst_end : src))
2090                     {
2091                       ONE_MORE_BYTE (c1);
2092                       if (c1 == ISO_CODE_ESC
2093                           && src + 1 < src_end
2094                           && src[0] == '%'
2095                           && src[1] == '@')
2096                         break;
2097                       d += CHAR_STRING (c1, d), produced_chars++;
2098                     }
2099                   if (d + 3 > (dst_bytes ? dst_end : src))
2100                     goto label_end_of_loop;
2101                   *d++ = ISO_CODE_ESC;
2102                   *d++ = '%';
2103                   *d++ = '@';
2104                   dst = d;
2105                   coding->produced_char += produced_chars + 3;
2106                 }
2107               else
2108                 goto label_invalid_code;
2109               continue;
2110
2111             default:
2112               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2113                 goto label_invalid_code;
2114               if (c1 >= 0x28 && c1 <= 0x2B)
2115                 {       /* designation of DIMENSION1_CHARS94 character set */
2116                   ONE_MORE_BYTE (c2);
2117                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2118                 }
2119               else if (c1 >= 0x2C && c1 <= 0x2F)
2120                 {       /* designation of DIMENSION1_CHARS96 character set */
2121                   ONE_MORE_BYTE (c2);
2122                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2123                 }
2124               else
2125                 goto label_invalid_code;
2126               /* We must update these variables now.  */
2127               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2128               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2129               continue;
2130             }
2131         }
2132
2133       /* Now we know CHARSET and 1st position code C1 of a character.
2134          Produce a multibyte sequence for that character while getting
2135          2nd position code C2 if necessary.  */
2136       if (CHARSET_DIMENSION (charset) == 2)
2137         {
2138           ONE_MORE_BYTE (c2);
2139           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2140             /* C2 is not in a valid range.  */
2141             goto label_invalid_code;
2142         }
2143       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2144       EMIT_CHAR (c);
2145       continue;
2146
2147     label_invalid_code:
2148       coding->errors++;
2149       if (COMPOSING_P (coding))
2150         DECODE_COMPOSITION_END ('1');
2151       src = src_base;
2152       c = *src++;
2153       EMIT_CHAR (c);
2154     }
2155
2156  label_end_of_loop:
2157   coding->consumed = coding->consumed_char = src_base - source;
2158   coding->produced = dst - destination;
2159   return;
2160 }
2161
2162
2163 /* ISO2022 encoding stuff.  */
2164
2165 /*
2166    It is not enough to say just "ISO2022" when encoding; we have to
2167    specify more details.  In Emacs, each ISO2022 coding system
2168    variant has the following specifications:
2169         1. Initial designation to G0 through G3.
2170         2. Allows short-form designation?
2171         3. ASCII should be designated to G0 before control characters?
2172         4. ASCII should be designated to G0 at end of line?
2173         5. 7-bit environment or 8-bit environment?
2174         6. Use locking-shift?
2175         7. Use Single-shift?
2176    And the following two are only for Japanese:
2177         8. Use ASCII in place of JIS0201-1976-Roman?
2178         9. Use JISX0208-1983 in place of JISX0208-1978?
2179    These specifications are encoded in `coding->flags' as flag bits
2180    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2181    details.
2182 */
2183
2184 /* Produce codes (escape sequence) for designating CHARSET to graphic
2185    register REG at DST, and increment DST.  DST is not an argument: it
2186    is the variable `dst' in scope at the expansion site.  On exit
2187    CHARSET is recorded in CODING as the designation of REG.  */
2188
2189 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2190   do {                                                                  \
2191     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2192     char *intermediate_char_94 = "()*+";  /* indexed by REG */          \
2193     char *intermediate_char_96 = ",-./";  /* indexed by REG */          \
2194     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2195                                                                         \
2196     if (revision < 255)  /* prefix: ESC & <'@' + revision> */           \
2197       {                                                                 \
2198         *dst++ = ISO_CODE_ESC;                                          \
2199         *dst++ = '&';                                                   \
2200         *dst++ = '@' + revision;                                        \
2201       }                                                                 \
2202     *dst++ = ISO_CODE_ESC;                                              \
2203     if (CHARSET_DIMENSION (charset) == 1)                               \
2204       {                                                                 \
2205         if (CHARSET_CHARS (charset) == 94)                              \
2206           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2207         else                                                            \
2208           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2209       }                                                                 \
2210     else                                                                \
2211       {                                                                 \
2212         *dst++ = '$';  /* dimension other than 1 */                     \
2213         if (CHARSET_CHARS (charset) == 94)                              \
2214           {  /* short form = no intermediate byte; used only */         \
2215             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
2216                 || reg != 0                                             \
2217                 || final_char < '@' || final_char > 'B')                \
2218               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2219           }  /* with the flag set, REG 0, final '@'..'B' */             \
2220         else                                                            \
2221           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2222       }                                                                 \
2223     *dst++ = final_char;                                                \
2224     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
2225   } while (0)
2226
2227 /* The following two macros produce codes (control character or escape
2228    sequence) for ISO2022 single-shift functions (single-shift-2 and
2229    single-shift-3).  They use `coding' and `dst' of the expansion site.  */
2230 /* Both set the single-shifting state that the character macros clear.  */
2231 #define ENCODE_SINGLE_SHIFT_2                           \
2232   do {                                                  \
2233     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2234       *dst++ = ISO_CODE_ESC, *dst++ = 'N';  /* ESC N */ \
2235     else                                                \
2236       *dst++ = ISO_CODE_SS2;                            \
2237     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2238   } while (0)
2239
2240 #define ENCODE_SINGLE_SHIFT_3                           \
2241   do {                                                  \
2242     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2243       *dst++ = ISO_CODE_ESC, *dst++ = 'O';  /* ESC O */ \
2244     else                                                \
2245       *dst++ = ISO_CODE_SS3;                            \
2246     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2247   } while (0)
2248
2249 /* The following four macros produce codes (control character or
2250    escape sequence) for ISO2022 locking-shift functions (shift-in,
2251    shift-out, locking-shift-2, and locking-shift-3) at `dst'.  */
2252 /* Each also records graphic register 0, 1, 2 or 3 as invoked to plane 0.  */
2253 #define ENCODE_SHIFT_IN                         \
2254   do {                                          \
2255     *dst++ = ISO_CODE_SI;                       \
2256     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2257   } while (0)
2258
2259 #define ENCODE_SHIFT_OUT                        \
2260   do {                                          \
2261     *dst++ = ISO_CODE_SO;                       \
2262     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2263   } while (0)
2264
2265 #define ENCODE_LOCKING_SHIFT_2                  \
2266   do {  /* ESC n */                             \
2267     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
2268     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
2269   } while (0)
2270
2271 #define ENCODE_LOCKING_SHIFT_3                  \
2272   do {  /* ESC o */                             \
2273     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
2274     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
2275   } while (0)
2276
2277 /* Produce codes for a DIMENSION1 character whose character set is
2278    CHARSET and whose position-code is C1.  Designation and invocation
2279    sequences are also produced in advance if necessary.  */
2280 /* Uses `coding' and `dst' of the expansion site; loops until a `break'.  */
2281 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2282   do {                                                                  \
2283     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))  /* SS2/SS3 sent */   \
2284       {                                                                 \
2285         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2286           *dst++ = c1 & 0x7F;                                           \
2287         else                                                            \
2288           *dst++ = c1 | 0x80;                                           \
2289         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2290         break;                                                          \
2291       }                                                                 \
2292     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2293       {  /* CHARSET is on graphic plane 0: high bit cleared */          \
2294         *dst++ = c1 & 0x7F;                                             \
2295         break;                                                          \
2296       }                                                                 \
2297     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2298       {  /* CHARSET is on graphic plane 1: high bit set */              \
2299         *dst++ = c1 | 0x80;                                             \
2300         break;                                                          \
2301       }                                                                 \
2302     else                                                                \
2303       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2304          must invoke it, or, at first, designate it to some graphic     \
2305          register.  Then repeat the loop to actually produce the        \
2306          character.  */                                                 \
2307       dst = encode_invocation_designation (charset, coding, dst);       \
2308   } while (1)
2309
2310 /* Produce codes for a DIMENSION2 character whose character set is
2311    CHARSET and whose position-codes are C1 and C2.  Designation and
2312    invocation codes are also produced in advance if necessary.  */
2313 /* Uses `coding' and `dst' of the expansion site; loops until a `break'.  */
2314 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2315   do {                                                                  \
2316     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))  /* SS2/SS3 sent */   \
2317       {                                                                 \
2318         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2319           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
2320         else                                                            \
2321           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
2322         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2323         break;                                                          \
2324       }                                                                 \
2325     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2326       {  /* CHARSET is on graphic plane 0: high bits cleared */         \
2327         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
2328         break;                                                          \
2329       }                                                                 \
2330     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2331       {  /* CHARSET is on graphic plane 1: high bits set */             \
2332         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
2333         break;                                                          \
2334       }                                                                 \
2335     else                                                                \
2336       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2337          must invoke it, or, at first, designate it to some graphic     \
2338          register.  Then repeat the loop to actually produce the        \
2339          character.  */                                                 \
2340       dst = encode_invocation_designation (charset, coding, dst);       \
2341   } while (1)
2342 /* Produce codes for character C at `dst', using `coding' of the expansion site.  */
2343 #define ENCODE_ISO_CHARACTER(c)                                 \
2344   do {                                                          \
2345     int charset, c1, c2;                                        \
2346                                                                 \
2347     SPLIT_CHAR (c, charset, c1, c2);                            \
2348     if (CHARSET_DEFINED_P (charset))                            \
2349       {                                                         \
2350         if (CHARSET_DIMENSION (charset) == 1)                   \
2351           {  /* ASCII may be sent as JISX0201-Roman */          \
2352             if (charset == CHARSET_ASCII                        \
2353                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
2354               charset = charset_latin_jisx0201;                 \
2355             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
2356           }                                                     \
2357         else                                                    \
2358           {  /* JISX0208 may be sent as JISX0208-1978 */        \
2359             if (charset == charset_jisx0208                     \
2360                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
2361               charset = charset_jisx0208_1978;                  \
2362             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
2363           }                                                     \
2364       }                                                         \
2365     else                                                        \
2366       {  /* undefined charset: position codes go out raw */     \
2367         *dst++ = c1;                                            \
2368         if (c2 >= 0)                                            \
2369           *dst++ = c2;                                          \
2370       }                                                         \
2371   } while (0)
2372
2373
2374 /* Instead of encoding character C, produce one or two `?'s.  */
2375 /* Two are produced when CHARSET_WIDTH of C's charset is greater than 1.  */
2376 #define ENCODE_UNSAFE_CHARACTER(c)                              \
2377   do {                                                          \
2378     ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
2379     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                   \
2380       ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);      \
2381   } while (0)
2382
2383
2384 /* Produce designation and invocation codes at a place pointed by DST
2385    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2386    Return new DST.  No destination bound is checked here.  */
2387 /* Nothing is produced if CHARSET's register is already invoked to plane 0 or 1.  */
2388 unsigned char *
2389 encode_invocation_designation (charset, coding, dst)
2390      int charset;
2391      struct coding_system *coding;
2392      unsigned char *dst;
2393 {
2394   int reg;                      /* graphic register number */
2395
2396   /* At first, check designations.  */
2397   for (reg = 0; reg < 4; reg++)
2398     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2399       break;    /* REG is the register already holding CHARSET */
2400
2401   if (reg >= 4)
2402     {
2403       /* CHARSET is not yet designated to any graphic registers.  */
2404       /* At first check the requested designation.  */
2405       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2406       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2407         /* Since CHARSET requests no special designation, designate it
2408            to graphic register 0.  */
2409         reg = 0;
2410
2411       ENCODE_DESIGNATION (charset, reg, coding);
2412     }
2413
2414   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2415       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2416     {
2417       /* Since the graphic register REG is not invoked to any graphic
2418          planes, invoke it to graphic plane 0.  */
2419       switch (reg)
2420         {
2421         case 0:                 /* graphic register 0 */
2422           ENCODE_SHIFT_IN;
2423           break;
2424
2425         case 1:                 /* graphic register 1 */
2426           ENCODE_SHIFT_OUT;
2427           break;
2428
2429         case 2:                 /* graphic register 2 */
2430           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2431             ENCODE_SINGLE_SHIFT_2;      /* leaves the plane 0 invocation alone */
2432           else
2433             ENCODE_LOCKING_SHIFT_2;     /* records register 2 on plane 0 */
2434           break;
2435
2436         case 3:                 /* graphic register 3 */
2437           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2438             ENCODE_SINGLE_SHIFT_3;      /* leaves the plane 0 invocation alone */
2439           else
2440             ENCODE_LOCKING_SHIFT_3;     /* records register 3 on plane 0 */
2441           break;
2442         }       /* no default: any other REG invokes nothing */
2443     }
2444
2445   return dst;
2446 }
2447
2448 /* Produce 2-byte codes for encoded composition rule RULE.  */
2449 /* RULE is split into GREF and NREF by COMPOSITION_DECODE_RULE first.  */
2450 #define ENCODE_COMPOSITION_RULE(rule)           \
2451   do {                                          \
2452     int gref, nref;                             \
2453     COMPOSITION_DECODE_RULE (rule, gref, nref); \
2454     *dst++ = 32 + 81 + gref;                    \
2455     *dst++ = 32 + nref;                         \
2456   } while (0)
2457
2458 /* Produce codes for indicating the start of a composition sequence
2459    (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
2460    which specify information about the composition.  See the comment
2461    in coding.h for the format of DATA.  */
2462 /* DATA[3] becomes CODING->composing and selects which of the three is sent.  */
2463 #define ENCODE_COMPOSITION_START(coding, data)                          \
2464   do {                                                                  \
2465     coding->composing = data[3];                                        \
2466     *dst++ = ISO_CODE_ESC;                                              \
2467     if (coding->composing == COMPOSITION_RELATIVE)                      \
2468       *dst++ = '0';                                                     \
2469     else                                                                \
2470       {  /* components follow: start reading them 4 ints in */          \
2471         *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
2472                   ? '3' : '4');                                         \
2473         coding->cmp_data_index = coding->cmp_data_start + 4;            \
2474         coding->composition_rule_follows = 0;                           \
2475       }                                                                 \
2476   } while (0)
2477
2478 /* Produce codes for indicating the end of the current composition.  */
2479 /* Sends ESC 1, then steps CODING past this composition's DATA[0] entries.  */
2480 #define ENCODE_COMPOSITION_END(coding, data)                    \
2481   do {                                                          \
2482     *dst++ = ISO_CODE_ESC;                                      \
2483     *dst++ = '1';                                               \
2484     coding->cmp_data_start += data[0];                          \
2485     coding->composing = COMPOSITION_NO;                         \
2486     if (coding->cmp_data_start == coding->cmp_data->used        \
2487         && coding->cmp_data->next)                              \
2488       {  /* this block is used up: go on to the next one */     \
2489         coding->cmp_data = coding->cmp_data->next;              \
2490         coding->cmp_data_start = 0;                             \
2491       }                                                         \
2492   } while (0)
2493
2494 /* Produce composition start sequence ESC 0.  Here, this sequence
2495    doesn't mean the start of a new composition but means that we have
2496    just produced components (alternate chars and composition rules) of
2497    the composition and the actual text follows in SRC.  */
2498 /* CODING->composing becomes COMPOSITION_RELATIVE from here on.  */
2499 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2500   do {                                          \
2501     *dst++ = ISO_CODE_ESC;                      \
2502     *dst++ = '0';                               \
2503     coding->composing = COMPOSITION_RELATIVE;   \
2504   } while (0)
2505
2506 /* The following three macros produce codes for indicating direction of
2507    text at `dst'.  ESC [ replaces CSI whenever the SEVEN_BITS bit is set.  */
2508 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2509   do {                                                  \
2510     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2511       *dst++ = ISO_CODE_ESC, *dst++ = '[';  /* ESC [ */ \
2512     else                                                \
2513       *dst++ = ISO_CODE_CSI;                            \
2514   } while (0)
2515 /* Start of right-to-left text: CSI 2 ].  Expands to one complete statement.  */
2516 #define ENCODE_DIRECTION_R2L    \
2517   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2'; *dst++ = ']'; } while (0)
2518 /* Start of left-to-right text: CSI 0 ].  Expands to one complete statement.  */
2519 #define ENCODE_DIRECTION_L2R    \
2520   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0'; *dst++ = ']'; } while (0)
2521
2522 /* Produce codes for designation and invocation to reset the graphic
2523    planes and registers to initial state.  Uses `coding' and `dst'.  */
2524 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2525   do {                                                                      \
2526     int reg;                                                                \
2527     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2528       ENCODE_SHIFT_IN;  /* put register 0 back on plane 0 */                \
2529     for (reg = 0; reg < 4; reg++)  /* negative initial value: skipped */    \
2530       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
2531           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
2532               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
2533         ENCODE_DESIGNATION                                                  \
2534           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2535   } while (0)
2536
2537 /* Produce designation sequences of charsets in the line started from
2538    SRC to a place pointed by DST, and return updated DST.
2539    SRC is received by value, so this scan is only a look-ahead.
2540    If the current block ends before any end-of-line, we may fail to
2541    find all the necessary designations.  */
2542
2543 static unsigned char *
2544 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2545      struct coding_system *coding;
2546      Lisp_Object translation_table;
2547      unsigned char *src, *src_end, *dst;
2548 {
2549   int charset, c, found = 0, reg;       /* FOUND counts filled slots of R */
2550   /* Table of charsets to be designated to each graphic register.  */
2551   int r[4];
2552
2553   for (reg = 0; reg < 4; reg++)
2554     r[reg] = -1;        /* -1: nothing to designate to REG */
2555   /* ONE_MORE_CHAR is defined outside this excerpt; presumably it leaves via label_end_of_loop -- confirm.  */
2556   while (found < 4)
2557     {
2558       ONE_MORE_CHAR (c);
2559       if (c == '\n')
2560         break;
2561
2562       charset = CHAR_CHARSET (c);
2563       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2564       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2565         {       /* the first charset asking for REG wins */
2566           found++;
2567           r[reg] = charset;
2568         }
2569     }
2570
2571  label_end_of_loop:
2572   if (found)
2573     {
2574       for (reg = 0; reg < 4; reg++)
2575         if (r[reg] >= 0         /* skip registers already holding it */
2576             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2577           ENCODE_DESIGNATION (r[reg], reg, coding);
2578     }
2579
2580   return dst;
2581 }
2582
2583 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2584 /* Encode SRC_BYTES bytes at SOURCE as ISO2022 codes at DESTINATION.  */
2585 static void
2586 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2587      struct coding_system *coding;
2588      unsigned char *source, *destination;
2589      int src_bytes, dst_bytes;
2590 {
2591   unsigned char *src = source;
2592   unsigned char *src_end = source + src_bytes;
2593   unsigned char *dst = destination;
2594   unsigned char *dst_end = destination + dst_bytes;
2595   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2596      from DST_END so that overflow needs checking only at the head of
2597      loop.  NOTE(review): confirm 20 also bounds encode_designation_at_bol.  */
2598   unsigned char *adjusted_dst_end = dst_end - 19;
2599   /* SRC_BASE remembers the start position in source in each loop.
2600      The loop will be exited when there's not enough source text to
2601      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2602      there's not enough destination area to produce encoded codes
2603      (tested at the head of the loop).  */
2604   unsigned char *src_base;
2605   int c;
2606   Lisp_Object translation_table;
2607   Lisp_Object safe_chars;
2608
2609   if (coding->flags & CODING_FLAG_ISO_SAFE)
2610     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2611
2612   safe_chars = coding_safe_chars (coding->symbol);
2613
2614   if (NILP (Venable_character_translation))
2615     translation_table = Qnil;
2616   else
2617     {
2618       translation_table = coding->translation_table_for_encode;
2619       if (NILP (translation_table))
2620         translation_table = Vstandard_translation_table_for_encode;
2621     }
2622
2623   coding->consumed_char = 0;    /* also feeds THIS_POS in the composition check */
2624   coding->errors = 0;
2625   while (1)
2626     {
2627       src_base = src;
2628       /* With DST_BYTES == 0 the limit is SRC itself (shared buffer? -- confirm).  */
2629       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2630         {
2631           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2632           break;
2633         }
2634
2635       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2636           && CODING_SPEC_ISO_BOL (coding))
2637         {
2638           /* We have to produce designation sequences if any now.  */
2639           dst = encode_designation_at_bol (coding, translation_table,
2640                                            src, src_end, dst);
2641           CODING_SPEC_ISO_BOL (coding) = 0;
2642         }
2643
2644       /* Check composition start and end.  */
2645       if (coding->composing != COMPOSITION_DISABLED
2646           && coding->cmp_data_start < coding->cmp_data->used)
2647         {
2648           struct composition_data *cmp_data = coding->cmp_data;
2649           int *data = cmp_data->data + coding->cmp_data_start;
2650           int this_pos = cmp_data->char_offset + coding->consumed_char;
2651
2652           if (coding->composing == COMPOSITION_RELATIVE)
2653             {
2654               if (this_pos == data[2])  /* reached the position stored in DATA[2] */
2655                 {
2656                   ENCODE_COMPOSITION_END (coding, data);
2657                   cmp_data = coding->cmp_data;  /* the macro may have advanced both */
2658                   data = cmp_data->data + coding->cmp_data_start;
2659                 }
2660             }
2661           else if (COMPOSING_P (coding))
2662             {
2663               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2664               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2665                 /* We have consumed components of the composition.
2666                    What follows in SRC is the composition's base
2667                    text.  */
2668                 ENCODE_COMPOSITION_FAKE_START (coding);
2669               else
2670                 {
2671                   int c = cmp_data->data[coding->cmp_data_index++];  /* shadows outer C */
2672                   if (coding->composition_rule_follows)
2673                     {
2674                       ENCODE_COMPOSITION_RULE (c);
2675                       coding->composition_rule_follows = 0;
2676                     }
2677                   else
2678                     {
2679                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2680                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2681                         ENCODE_UNSAFE_CHARACTER (c);
2682                       else
2683                         ENCODE_ISO_CHARACTER (c);
2684                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2685                         coding->composition_rule_follows = 1;
2686                     }
2687                   continue;     /* a component, not source text: no consumed_char++ */
2688                 }
2689             }
2690           if (!COMPOSING_P (coding))
2691             {
2692               if (this_pos == data[1])  /* reached the position stored in DATA[1] */
2693                 {
2694                   ENCODE_COMPOSITION_START (coding, data);
2695                   continue;     /* only the start sequence was written */
2696                 }
2697             }
2698         }
2699
2700       ONE_MORE_CHAR (c);
2701
2702       /* Now encode the character C.  */
2703       if (c < 0x20 || c == 0x7F)
2704         {
2705           if (c == '\r')
2706             {
2707               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2708                 {
2709                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2710                     ENCODE_RESET_PLANE_AND_REGISTER;
2711                   *dst++ = c;
2712                   continue;     /* note: consumed_char is not incremented here */
2713                 }
2714               /* fall through to treat '\r' as '\n' ...  */
2715               c = '\n';
2716             }
2717           if (c == '\n')
2718             {
2719               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2720                 ENCODE_RESET_PLANE_AND_REGISTER;
2721               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)  /* state only; nothing is emitted */
2722                 bcopy (coding->spec.iso2022.initial_designation,
2723                        coding->spec.iso2022.current_designation,
2724                        sizeof coding->spec.iso2022.initial_designation);
2725               if (coding->eol_type == CODING_EOL_LF
2726                   || coding->eol_type == CODING_EOL_UNDECIDED)
2727                 *dst++ = ISO_CODE_LF;
2728               else if (coding->eol_type == CODING_EOL_CRLF)
2729                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2730               else
2731                 *dst++ = ISO_CODE_CR;
2732               CODING_SPEC_ISO_BOL (coding) = 1; /* the next iteration starts a line */
2733             }
2734           else
2735             {
2736               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2737                 ENCODE_RESET_PLANE_AND_REGISTER;
2738               *dst++ = c;
2739             }
2740         }
2741       else if (ASCII_BYTE_P (c))
2742         ENCODE_ISO_CHARACTER (c);
2743       else if (SINGLE_BYTE_CHAR_P (c))
2744         {       /* written as is and counted as an error */
2745           *dst++ = c;
2746           coding->errors++;
2747         }
2748       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2749                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2750         ENCODE_UNSAFE_CHARACTER (c);
2751       else
2752         ENCODE_ISO_CHARACTER (c);
2753
2754       coding->consumed_char++;
2755     }
2756
2757  label_end_of_loop:
2758   coding->consumed = src_base - source; /* SRC_BASE: start of the last iteration */
2759   coding->produced = coding->produced_char = dst - destination;
2760 }
2761
2762 \f
2763 /*** 4. SJIS and BIG5 handlers ***/
2764
2765 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2766    quite widely.  So, for the moment, Emacs supports them in the bare
2767    C code.  But, in the future, they may be supported only by CCL.  */
2768
2769 /* SJIS is a coding system encoding three character sets: ASCII, right
2770    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2771    as is.  A character of charset katakana-jisx0201 is encoded by
2772    "position-code + 0x80".  A character of charset japanese-jisx0208
2773    is encoded in 2-byte but two position-codes are divided and shifted
2774    so that it fits in the range below.
2775
2776    --- CODE RANGE of SJIS ---
2777    (character set)      (range)
2778    ASCII                0x00 .. 0x7F
2779    KATAKANA-JISX0201    0xA1 .. 0xDF
2780    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2781             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2782    -------------------------------
2783
2784 */
2785
2786 /* BIG5 is a coding system encoding two character sets: ASCII and
2787    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2788    character set and is encoded in two bytes.
2789
2790    --- CODE RANGE of BIG5 ---
2791    (character set)      (range)
2792    ASCII                0x00 .. 0x7F
2793    Big5 (1st byte)      0xA1 .. 0xFE
2794         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2795    --------------------------
2796
2797    Since the number of characters in Big5 is larger than maximum
2798    characters in Emacs' charset (96x96), it can't be handled as one
2799    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2800    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2801    contains frequently used characters and the latter contains less
2802    frequently used characters.  */
2803
2804 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2805    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2806    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2807    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2808
2809 /* Number of Big5 characters which have the same code in 1st byte:
        94 trail bytes in 0xA1..0xFE plus 63 trail bytes in 0x40..0x7E.  */
2810 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2811
     /* Convert the BIG5 byte pair B1, B2 to CHARSET and the position-codes
        C1, C2.  The pair is first turned into a linear index (TEMP) counted
        from lead byte 0xA1; lead bytes below 0xC9 select charset_big5_1,
        the others select charset_big5_2 with the index rebased to 0.  The
        index is then split into two position-codes in a 94-column layout
        starting at 0x21.  B1 and B2 are read before C1 and C2 are written,
        so the same variables may be passed for both pairs.  Every argument
        is parenthesized in the expansion so that compound expressions are
        safe to pass.  */
2812 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2813   do {                                                                  \
2814     unsigned int temp                                                   \
2815       = ((b1) - 0xA1) * BIG5_SAME_ROW + (b2) - ((b2) < 0x7F ? 0x40 : 0x62); \
2816     if ((b1) < 0xC9)                                                    \
2817       (charset) = charset_big5_1;                                       \
2818     else                                                                \
2819       {                                                                 \
2820         (charset) = charset_big5_2;                                     \
2821         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
2822       }                                                                 \
2823     (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
2824     (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
2825   } while (0)
2826
     /* Inverse of DECODE_BIG5: convert CHARSET and the position-codes C1,
        C2 to the BIG5 byte pair B1, B2.  Trail-byte offsets below 0x3F map
        to 0x40..0x7E, the others to 0xA1..0xFE.  C1 and C2 are read before
        B1 and B2 are written, so the same variables may be passed for both
        pairs.  */
2827 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2828   do {                                                                  \
2829     unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
2830     if ((charset) == charset_big5_2)                                    \
2831       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2832     (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
2833     (b2) = temp % BIG5_SAME_ROW;                                        \
2834     (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
2835   } while (0)
2836
2837 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2838    Check if the bytes from SRC up to SRC_END look like SJIS text.  If
2839    they do, return CODING_CATEGORY_MASK_SJIS, else return 0.

        Bytes below 0x80 are accepted as is.  0x80, 0xA0 and anything
        above 0xEF cannot start an SJIS character.  A lead byte in
        0x81..0x9F or 0xE0..0xEF must be followed by a trail byte in
        0x40..0xFC other than 0x7F; the remaining bytes (0xA1..0xDF)
        stand alone.  MULTIBYTEP is only handed to
        ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
2840
2841 static int
2842 detect_coding_sjis (src, src_end, multibytep)
2843      unsigned char *src, *src_end;
2844      int multibytep;
2845 {
2846   int c;
2847   /* Dummy for the byte-fetching macro used below, which presumably
          refers to a variable named `coding' (not otherwise used here).  */
2848   struct coding_system dummy_coding;
2849   struct coding_system *coding = &dummy_coding;
2850
2851   while (1)
2852     {
2853       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2854       if (c < 0x80)
             /* ASCII range: nothing more to check for this byte.  */
2855         continue;
2856       if (c == 0x80 || c == 0xA0 || c > 0xEF)
             /* Never a valid first byte in SJIS.  */
2857         return 0;
2858       if (c <= 0x9F || c >= 0xE0)
2859         {
               /* Lead byte of a two-byte character: validate the trail
                  byte.  */
2860           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2861           if (c < 0x40 || c == 0x7F || c > 0xFC)
2862             return 0;
2863         }
2864     }
      /* The loop above never falls out by itself; this label is presumably
         the target of a jump taken inside ONE_MORE_BYTE_CHECK_MULTIBYTE
         when the source is exhausted, i.e. no invalid byte was met.  */
2865  label_end_of_loop:
2866   return CODING_CATEGORY_MASK_SJIS;
2867 }
2868
2869 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2870    Check if the bytes from SRC up to SRC_END look like BIG5 text.  If
2871    they do, return CODING_CATEGORY_MASK_BIG5, else return 0.

        Bytes below 0x80 are accepted as is.  Any other byte must be a
        lead byte in 0xA1..0xFE followed by a trail byte in 0x40..0x7E or
        0xA1..0xFE, the same ranges that decode_coding_sjis_big5 accepts.
        MULTIBYTEP is only handed to ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
2872
2873 static int
2874 detect_coding_big5 (src, src_end, multibytep)
2875      unsigned char *src, *src_end;
2876      int multibytep;
2877 {
2878   int c;
2879   /* Dummy for the byte-fetching macro used below, which presumably
          refers to a variable named `coding' (not otherwise used here).  */
2880   struct coding_system dummy_coding;
2881   struct coding_system *coding = &dummy_coding;
2882
2883   while (1)
2884     {
2885       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2886       if (c < 0x80)
2887         continue;
2888       if (c < 0xA1 || c > 0xFE)
             /* Not a valid BIG5 lead byte.  */
2889         return 0;
2890       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
           /* 0x7F is not a valid trail byte, so the gap between the two
              trail-byte ranges is 0x7F..0xA0.  */
2891       if (c < 0x40 || (c > 0x7E && c < 0xA1) || c > 0xFE)
2892         return 0;
2893     }
      /* The loop above never falls out by itself; this label is presumably
         the target of a jump taken inside ONE_MORE_BYTE_CHECK_MULTIBYTE
         when the source is exhausted, i.e. no invalid byte was met.  */
2894  label_end_of_loop:
2895   return CODING_CATEGORY_MASK_BIG5;
2896 }
2897
2898 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2899    Check if the bytes from SRC up to SRC_END look like UTF-8 text.  If
2900    they do, return CODING_CATEGORY_MASK_UTF_8, else return 0.

        The check is purely structural: every byte of 0x80 or above must be
        a leading octet announcing a 2- to 6-octet sequence, followed by
        the matching number of continuation octets.  Neither overlong
        forms nor the range of the encoded value are examined.  */
2901
     /* Classify an octet by its high-order bits: a single-octet character,
        a continuation (10xxxxxx), or the leading octet of an N-octet
        sequence.  */
2902 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2903 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2904 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2905 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2906 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2907 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2908 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2909
2910 static int
2911 detect_coding_utf_8 (src, src_end, multibytep)
2912      unsigned char *src, *src_end;
2913      int multibytep;
2914 {
2915   unsigned char c;
       /* Number of continuation octets still expected after the current
          leading octet.  */
2916   int seq_maybe_bytes;
2917   /* Dummy for the byte-fetching macro used below, which presumably
          refers to a variable named `coding' (not otherwise used here).  */
2918   struct coding_system dummy_coding;
2919   struct coding_system *coding = &dummy_coding;
2920
2921   while (1)
2922     {
2923       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2924       if (UTF_8_1_OCTET_P (c))
2925         continue;
2926       else if (UTF_8_2_OCTET_LEADING_P (c))
2927         seq_maybe_bytes = 1;
2928       else if (UTF_8_3_OCTET_LEADING_P (c))
2929         seq_maybe_bytes = 2;
2930       else if (UTF_8_4_OCTET_LEADING_P (c))
2931         seq_maybe_bytes = 3;
2932       else if (UTF_8_5_OCTET_LEADING_P (c))
2933         seq_maybe_bytes = 4;
2934       else if (UTF_8_6_OCTET_LEADING_P (c))
2935         seq_maybe_bytes = 5;
2936       else
             /* A continuation octet without a leading octet, or 0xFE/0xFF.  */
2937         return 0;
2938
           /* Every announced continuation octet must be of the form
              10xxxxxx.  */
2939       do
2940         {
2941           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2942           if (!UTF_8_EXTRA_OCTET_P (c))
2943             return 0;
2944           seq_maybe_bytes--;
2945         }
2946       while (seq_maybe_bytes > 0);
2947     }
2948
      /* The loop above never falls out by itself; this label is presumably
         the target of a jump taken inside ONE_MORE_BYTE_CHECK_MULTIBYTE
         when the source is exhausted, i.e. no invalid octet was met.  */
2949  label_end_of_loop:
2950   return CODING_CATEGORY_MASK_UTF_8;
2951 }
2952
2953 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2954    Check if a text starts with a UTF-16 byte order mark.  Return
2955    CODING_CATEGORY_MASK_UTF_16_BE if the first two bytes are 0xFE
2956    0xFF, CODING_CATEGORY_MASK_UTF_16_LE if they are 0xFF 0xFE, else
2957    return 0.  */
2958
2959 #define UTF_16_INVALID_P(val)   \
2960   (((val) == 0xFFFE)            \
2961    || ((val) == 0xFFFF))
2962
     /* A UTF-16 code unit is a high (leading) surrogate when it lies in
        0xD800..0xDBFF and a low (trailing) surrogate when it lies in
        0xDC00..0xDFFF.  The top six bits identify each range, hence the
        0xFC00 mask; masking with the range base itself would also match
        values outside the range (e.g. 0xFFFF, or 0xDC00 for the high
        test).  */
2963 #define UTF_16_HIGH_SURROGATE_P(val) \
2964   (((val) & 0xFC00) == 0xD800)
2965
2966 #define UTF_16_LOW_SURROGATE_P(val) \
2967   (((val) & 0xFC00) == 0xDC00)
2968
2969 static int
2970 detect_coding_utf_16 (src, src_end, multibytep)
2971      unsigned char *src, *src_end;
2972      int multibytep;
2973 {
       /* The first two bytes of the text; nothing beyond them is read.  */
2974   unsigned char c1, c2;
2975   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
2976   struct coding_system dummy_coding;
2977   struct coding_system *coding = &dummy_coding;
2978
2979   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2980   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2981
       /* Only a byte order mark is recognized: 0xFF 0xFE announces
          little endian, 0xFE 0xFF announces big endian.  */
2982   if ((c1 == 0xFF) && (c2 == 0xFE))
2983     return CODING_CATEGORY_MASK_UTF_16_LE;
2984   else if ((c1 == 0xFE) && (c2 == 0xFF))
2985     return CODING_CATEGORY_MASK_UTF_16_BE;
2986
      /* Reached when no byte order mark was found, and presumably also
         from inside ONE_MORE_BYTE_CHECK_MULTIBYTE when fewer than two
         bytes are available.  */
2987  label_end_of_loop:
2988   return 0;
2989 }
2990
2991 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2992    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.

        SOURCE holds SRC_BYTES bytes of input and DESTINATION has room for
        DST_BYTES bytes.  On return, CODING->consumed and
        CODING->consumed_char hold the number of source bytes handled
        completely and CODING->produced holds the number of bytes written.
        A byte sequence that is not valid in the selected encoding
        increments CODING->errors and its first byte is emitted as is.
        End-of-line conversion follows CODING->eol_type; with
        CODING_MODE_INHIBIT_INCONSISTENT_EOL set, a bare '\n' in CR or CRLF
        text stops decoding with CODING_FINISH_INCONSISTENT_EOL.  */
2993
2994 static void
2995 decode_coding_sjis_big5 (coding, source, destination,
2996                          src_bytes, dst_bytes, sjis_p)
2997      struct coding_system *coding;
2998      unsigned char *source, *destination;
2999      int src_bytes, dst_bytes;
3000      int sjis_p;
3001 {
3002   unsigned char *src = source;
3003   unsigned char *src_end = source + src_bytes;
3004   unsigned char *dst = destination;
3005   unsigned char *dst_end = destination + dst_bytes;
3006   /* SRC_BASE remembers the start position in source in each loop.
3007      The loop will be exited when there's not enough source code
3008      (within macro ONE_MORE_BYTE), or when there's not enough
3009      destination area to produce a character (within macro
3010      EMIT_CHAR).  */
3011   unsigned char *src_base;
3012   Lisp_Object translation_table;
3013
       /* Pick the translation table: none when character translation is
          disabled, otherwise the coding system's own table, falling back
          to the standard one.  */
3014   if (NILP (Venable_character_translation))
3015     translation_table = Qnil;
3016   else
3017     {
3018       translation_table = coding->translation_table_for_decode;
3019       if (NILP (translation_table))
3020         translation_table = Vstandard_translation_table_for_decode;
3021     }
3022
3023   coding->produced_char = 0;
3024   while (1)
3025     {
3026       int c, charset, c1, c2;
3027
3028       src_base = src;
3029       ONE_MORE_BYTE (c1);
3030
3031       if (c1 < 0x80)
3032         {
3033           charset = CHARSET_ASCII;
3034           if (c1 < 0x20)
3035             {
3036               if (c1 == '\r')
3037                 {
3038                   if (coding->eol_type == CODING_EOL_CRLF)
3039                     {
3040                       ONE_MORE_BYTE (c2);
3041                       if (c2 == '\n')
3042                         c1 = c2;
3043                       else
3044                         /* To process C2 again, SRC is subtracted by 1.  */
3045                         src--;
3046                     }
3047                   else if (coding->eol_type == CODING_EOL_CR)
3048                     c1 = '\n';
3049                 }
3050               else if (c1 == '\n'
3051                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3052                        && (coding->eol_type == CODING_EOL_CR
3053                            || coding->eol_type == CODING_EOL_CRLF))
3054                 {
3055                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3056                   goto label_end_of_loop;
3057                 }
3058             }
3059         }
3060       else
3061         {
3062           if (sjis_p)
3063             {
                   /* These bytes never start an SJIS character.  */
3064               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3065                 goto label_invalid_code;
3066               if (c1 <= 0x9F || c1 >= 0xE0)
3067                 {
3068                   /* SJIS -> JISX0208 */
3069                   ONE_MORE_BYTE (c2);
3070                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3071                     goto label_invalid_code;
3072                   DECODE_SJIS (c1, c2, c1, c2);
3073                   charset = charset_jisx0208;
3074                 }
3075               else
3076                 /* SJIS -> JISX0201-Kana */
3077                 charset = charset_katakana_jisx0201;
3078             }
3079           else
3080             {
3081               /* BIG5 -> Big5.  The lead byte must be in 0xA1..0xFE;
                      0xA0 must be rejected too, because DECODE_BIG5
                      computes (c1 - 0xA1) in unsigned arithmetic and a
                      smaller lead byte would wrap around.  */
3082               if (c1 < 0xA1 || c1 > 0xFE)
3083                 goto label_invalid_code;
3084               ONE_MORE_BYTE (c2);
3085               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3086                 goto label_invalid_code;
3087               DECODE_BIG5 (c1, c2, charset, c1, c2);
3088             }
3089         }
3090
3091       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3092       EMIT_CHAR (c);
3093       continue;
3094
3095     label_invalid_code:
           /* Count the error, then go back to the start of the offending
              sequence and emit just its first byte unchanged; the bytes
              after it are examined again on the next iteration.  */
3096       coding->errors++;
3097       src = src_base;
3098       c = *src++;
3099       EMIT_CHAR (c);
3100     }
3101
3102  label_end_of_loop:
       /* SRC_BASE, not SRC, is used so that a partially read character is
          not counted as consumed.  */
3103   coding->consumed = coding->consumed_char = src_base - source;
3104   coding->produced = dst - destination;
3105   return;
3106 }
3107
3108 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3109    This function can encode charsets `ascii', `katakana-jisx0201',
3110    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3111    are sure that all these charsets are registered as official charset
3112    (i.e. do not have extended leading-codes).  Characters of other
3113    charsets are produced without any encoding.  If SJIS_P is 1, encode
3114    SJIS text, else encode BIG5 text.

        When encoding SJIS, characters of charset_jisx0208_1978 are
        treated like japanese-jisx0208 and characters of
        charset_latin_jisx0201 are emitted as their single position-code.
        If CODING_MODE_INHIBIT_UNENCODABLE_CHAR is set, an unencodable
        character is replaced by CODING_REPLACEMENT_CHARACTER (emitted
        twice when CHARSET_WIDTH of its charset is greater than 1) instead
        of being copied as is.

        On return, CODING->consumed holds the number of source bytes
        handled completely, CODING->consumed_char the number of characters
        encoded, and CODING->produced and CODING->produced_char the number
        of bytes written.  */
3115
3116 static void
3117 encode_coding_sjis_big5 (coding, source, destination,
3118                          src_bytes, dst_bytes, sjis_p)
3119      struct coding_system *coding;
3120      unsigned char *source, *destination;
3121      int src_bytes, dst_bytes;
3122      int sjis_p;
3123 {
3124   unsigned char *src = source;
3125   unsigned char *src_end = source + src_bytes;
3126   unsigned char *dst = destination;
3127   unsigned char *dst_end = destination + dst_bytes;
3128   /* SRC_BASE remembers the start position in source in each loop.
3129      The loop will be exited when there's not enough source text to
3130      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3131      there's not enough destination area to produce encoded codes
3132      (within macro EMIT_BYTES).  */
3133   unsigned char *src_base;
3134   Lisp_Object translation_table;
3135
       /* Pick the translation table: none when character translation is
          disabled, otherwise the coding system's own table, falling back
          to the standard one.  */
3136   if (NILP (Venable_character_translation))
3137     translation_table = Qnil;
3138   else
3139     {
3140       translation_table = coding->translation_table_for_encode;
3141       if (NILP (translation_table))
3142         translation_table = Vstandard_translation_table_for_encode;
3143     }
3144
3145   while (1)
3146     {
3147       int c, charset, c1, c2;
3148
3149       src_base = src;
3150       ONE_MORE_CHAR (c);
3151
3152       /* Now encode the character C.  */
3153       if (SINGLE_BYTE_CHAR_P (c))
3154         {
               /* Only end-of-line bytes need conversion here; the cases
                  below fall through on purpose.  */
3155           switch (c)
3156             {
3157             case '\r':
3158               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3159                 {
3160                   EMIT_ONE_BYTE (c);
3161                   break;
3162                 }
                   /* With selective display, '\r' also means end-of-line:
                      fall through and treat it as '\n'.  */
3163               c = '\n';
3164             case '\n':
3165               if (coding->eol_type == CODING_EOL_CRLF)
3166                 {
3167                   EMIT_TWO_BYTES ('\r', c);
3168                   break;
3169                 }
3170               else if (coding->eol_type == CODING_EOL_CR)
3171                 c = '\r';
                   /* Fall through to emit the (possibly replaced) byte.  */
3172             default:
3173               EMIT_ONE_BYTE (c);
3174             }
3175         }
3176       else
3177         {
3178           SPLIT_CHAR (c, charset, c1, c2);
3179           if (sjis_p)
3180             {
3181               if (charset == charset_jisx0208
3182                   || charset == charset_jisx0208_1978)
3183                 {
3184                   ENCODE_SJIS (c1, c2, c1, c2);
3185                   EMIT_TWO_BYTES (c1, c2);
3186                 }
3187               else if (charset == charset_katakana_jisx0201)
                     /* Position-code + 0x80, as described in the SJIS
                        notes above.  */
3188                 EMIT_ONE_BYTE (c1 | 0x80);
3189               else if (charset == charset_latin_jisx0201)
3190                 EMIT_ONE_BYTE (c1);
3191               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3192                 {
3193                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3194                   if (CHARSET_WIDTH (charset) > 1)
3195                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3196                 }
3197               else
3198                 /* There's no way other than producing the internal
3199                    codes as is.  */
3200                 EMIT_BYTES (src_base, src);
3201             }
3202           else
3203             {
3204               if (charset == charset_big5_1 || charset == charset_big5_2)
3205                 {
3206                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3207                   EMIT_TWO_BYTES (c1, c2);
3208                 }
3209               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3210                 {
3211                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3212                   if (CHARSET_WIDTH (charset) > 1)
3213                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3214                 }
3215               else
3216                 /* There's no way other than producing the internal
3217                    codes as is.  */
3218                 EMIT_BYTES (src_base, src);
3219             }
3220         }
3221       coding->consumed_char++;
3222     }
3223
3224  label_end_of_loop:
       /* SRC_BASE, not SRC, is used so that a character whose encoding did
          not complete is not counted as consumed.  */
3225   coding->consumed = src_base - source;
3226   coding->produced = coding->produced_char = dst - destination;
3227 }
3228
3229 \f
3230 /*** 5. CCL handlers ***/
3231
3232 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3233    Check if a text is encoded in a coding system of which
3234    encoder/decoder are written in CCL program.  If it is, return
3235    CODING_CATEGORY_MASK_CCL, else return 0.

        The text qualifies when every byte from SRC up to SRC_END is
        marked nonzero in the `valid_codes' table of the coding system
        registered for coding-category-ccl.  Return 0 as well when no
        coding system is registered for that category.  */
3236
3237 static int
3238 detect_coding_ccl (src, src_end, multibytep)
3239      unsigned char *src, *src_end;
3240      int multibytep;
3241 {
       /* Table indexed by byte value; a zero entry marks a byte that the
          registered coding system does not accept.  */
3242   unsigned char *valid;
3243   int c;
3244   /* Dummy for the byte-fetching macro used below, which presumably
          refers to a variable named `coding' (not otherwise used here).  */
3245   struct coding_system dummy_coding;
3246   struct coding_system *coding = &dummy_coding;
3247
3248   /* No coding system is assigned to coding-category-ccl.  */
3249   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3250     return 0;
3251
3252   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3253   while (1)
3254     {
3255       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3256       if (! valid[c])
3257         return 0;
3258     }
      /* The loop above never falls out by itself; this label is presumably
         the target of a jump taken inside ONE_MORE_BYTE_CHECK_MULTIBYTE
         when the source is exhausted, i.e. every byte was valid.  */
3259  label_end_of_loop:
3260   return CODING_CATEGORY_MASK_CCL;
3261 }
3262
3263 \f
3264 /*** 6. End-of-line handlers ***/
3265
3266 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Copy SOURCE to DESTINATION while converting the end-of-line format
        given by CODING->eol_type to '\n'.  For CRLF text, "\r\n" becomes
        '\n' and a '\r' not followed by '\n' is kept.  For CR text, '\r'
        becomes '\n'.  For any other type the bytes are emitted unchanged.
        In CR and CRLF text, a bare '\n' stops decoding with
        CODING_FINISH_INCONSISTENT_EOL when
        CODING_MODE_INHIBIT_INCONSISTENT_EOL is set.  On return,
        CODING->consumed and CODING->consumed_char hold the number of
        source bytes handled completely and CODING->produced the number of
        bytes written.  */
3267
3268 static void
3269 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3270      struct coding_system *coding;
3271      unsigned char *source, *destination;
3272      int src_bytes, dst_bytes;
3273 {
3274   unsigned char *src = source;
3275   unsigned char *dst = destination;
3276   unsigned char *src_end = src + src_bytes;
3277   unsigned char *dst_end = dst + dst_bytes;
       /* Always Qnil here; presumably declared only because EMIT_CHAR or
          ONE_MORE_BYTE refers to a variable of this name.  */
3278   Lisp_Object translation_table;
3279   /* SRC_BASE remembers the start position in source in each loop.
3280      The loop will be exited when there's not enough source code
3281      (within macro ONE_MORE_BYTE), or when there's not enough
3282      destination area to produce a character (within macro
3283      EMIT_CHAR).  */
3284   unsigned char *src_base;
3285   int c;
3286
3287   translation_table = Qnil;
3288   switch (coding->eol_type)
3289     {
3290     case CODING_EOL_CRLF:
3291       while (1)
3292         {
3293           src_base = src;
3294           ONE_MORE_BYTE (c);
3295           if (c == '\r')
3296             {
3297               ONE_MORE_BYTE (c);
3298               if (c != '\n')
3299                 {
                       /* A lone '\r': push the following byte back so it
                          is processed on the next iteration, and emit the
                          '\r' itself.  */
3300                   src--;
3301                   c = '\r';
3302                 }
3303             }
3304           else if (c == '\n'
3305                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3306             {
3307               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3308               goto label_end_of_loop;
3309             }
3310           EMIT_CHAR (c);
3311         }
           /* Not reached: the loop above is left only through a goto.  */
3312       break;
3313
3314     case CODING_EOL_CR:
3315       while (1)
3316         {
3317           src_base = src;
3318           ONE_MORE_BYTE (c);
3319           if (c == '\n')
3320             {
3321               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3322                 {
3323                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3324                   goto label_end_of_loop;
3325                 }
3326             }
3327           else if (c == '\r')
3328             c = '\n';
3329           EMIT_CHAR (c);
3330         }
           /* Not reached: the loop above is left only through a goto.  */
3331       break;
3332
3333     default:                    /* no need for EOL handling */
3334       while (1)
3335         {
3336           src_base = src;
3337           ONE_MORE_BYTE (c);
3338           EMIT_CHAR (c);
3339         }
3340     }
3341
3342  label_end_of_loop:
       /* SRC_BASE, not SRC, is used so that a "\r\n" pair read only in
          part is not counted as consumed.  */
3343   coding->consumed = coding->consumed_char = src_base - source;
3344   coding->produced = dst - destination;
3345   return;
3346 }
3347
3348 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3349    format of end-of-line according to `coding->eol_type'.  It also
3350    converts multibyte form 8-bit characters to unibyte if
3351    CODING->src_multibyte is nonzero.  If `coding->mode &
3352    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3353    also means end-of-line.

        SOURCE holds SRC_BYTES bytes of input (zero is allowed) and
        DESTINATION has room for DST_BYTES bytes; outside the CRLF case a
        DST_BYTES of zero makes the whole source be copied without a size
        check.  On return, CODING->consumed holds the number of source
        bytes handled and CODING->produced and CODING->produced_char the
        number of bytes written.  CODING->result is set to
        CODING_FINISH_INSUFFICIENT_SRC when a trailing
        LEADING_CODE_8_BIT_CONTROL was left unconsumed, and to
        CODING_FINISH_INSUFFICIENT_DST when the copy was truncated to
        DST_BYTES.  */
3354
3355 static void
3356 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3357      struct coding_system *coding;
3358      const unsigned char *source;
3359      unsigned char *destination;
3360      int src_bytes, dst_bytes;
3361 {
3362   const unsigned char *src = source;
3363   unsigned char *dst = destination;
3364   const unsigned char *src_end = src + src_bytes;
3365   unsigned char *dst_end = dst + dst_bytes;
       /* Always Qnil here; presumably declared only because the EMIT_*
          macros refer to a variable of this name.  */
3366   Lisp_Object translation_table;
3367   /* SRC_BASE remembers the start position in source in each loop.
3368      The loop will be exited when there's not enough source text to
3369      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3370      there's not enough destination area to produce encoded codes
3371      (within macro EMIT_BYTES).  */
3372   const unsigned char *src_base;
3373   unsigned char *tmp;
3374   int c;
3375   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3376
3377   translation_table = Qnil;
       /* A multibyte source ending in a lone LEADING_CODE_8_BIT_CONTROL is
          shortened by that byte, which is left unconsumed.  The
          SRC_BYTES > 0 test keeps an empty source from reading the byte
          before SOURCE.  */
3378   if (coding->src_multibyte && src_bytes > 0
3379       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3380     {
3381       src_end--;
3382       src_bytes--;
3383       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3384     }
3385
3386   if (coding->eol_type == CODING_EOL_CRLF)
3387     {
           /* The output may grow, so emit byte by byte.  */
3388       while (src < src_end)
3389         {
3390           src_base = src;
3391           c = *src++;
3392           if (c >= 0x20)
3393             EMIT_ONE_BYTE (c);
3394           else if (c == '\n' || (c == '\r' && selective_display))
3395             EMIT_TWO_BYTES ('\r', '\n');
3396           else
3397             EMIT_ONE_BYTE (c);
3398         }
3399       src_base = src;
3400     label_end_of_loop:
3401       ;
3402     }
3403   else
3404     {
           /* The output has the same length as the input: copy as much as
              fits, then patch end-of-line bytes in place.  */
3405       if (!dst_bytes || src_bytes <= dst_bytes)
3406         {
3407           safe_bcopy (src, dst, src_bytes);
3408           src_base = src_end;
3409           dst += src_bytes;
3410         }
3411       else
3412         {
               /* Do not cut the copy right after a
                  LEADING_CODE_8_BIT_CONTROL.  */
3413           if (coding->src_multibyte
3414               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3415             dst_bytes--;
3416           safe_bcopy (src, dst, dst_bytes);
3417           src_base = src + dst_bytes;
3418           dst = destination + dst_bytes;
3419           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3420         }
3421       if (coding->eol_type == CODING_EOL_CR)
3422         {
3423           for (tmp = destination; tmp < dst; tmp++)
3424             if (*tmp == '\n') *tmp = '\r';
3425         }
3426       else if (selective_display)
3427         {
3428           for (tmp = destination; tmp < dst; tmp++)
3429             if (*tmp == '\r') *tmp = '\n';
3430         }
3431     }
3432   if (coding->src_multibyte)
3433     dst = destination + str_as_unibyte (destination, dst - destination);
3434
3435   coding->consumed = src_base - source;
3436   coding->produced = dst - destination;
3437   coding->produced_char = coding->produced;
3438 }
3439
3440 \f
3441 /*** 7. C library functions ***/
3442
3443 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3444    has a property `coding-system'.  The value of this property is a
3445    vector of length 5 (called the coding-vector).  Among elements of
3446    this vector, the first (element[0]) and the fifth (element[4])
3447    carry important information for decoding/encoding.  Before
3448    decoding/encoding, this information should be set in fields of a
3449    structure of type `coding_system'.
3450
3451    The value of the property `coding-system' can be a symbol of another
3452    subsidiary coding-system.  In that case, Emacs gets coding-vector
3453    from that symbol.
3454
3455    `element[0]' contains information to be set in `coding->type'.  The
3456    value and its meaning is as follows:
3457
3458    0 -- coding_type_emacs_mule
3459    1 -- coding_type_sjis
3460    2 -- coding_type_iso2022
3461    3 -- coding_type_big5
3462    4 -- coding_type_ccl encoder/decoder written in CCL
3463    nil -- coding_type_no_conversion
3464    t -- coding_type_undecided (automatic conversion on decoding,
3465                                no-conversion on encoding)
3466
3467    `element[4]' contains information to be set in `coding->flags' and
3468    `coding->spec'.  The meaning varies by `coding->type'.
3469
3470    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3471    of length 32 (of which the first 13 sub-elements are used now).
3472    Meanings of these sub-elements are:
3473
3474    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3475         If the value is an integer of valid charset, the charset is
3476         assumed to be designated to graphic register N initially.
3477
3478         If the value is negative, its absolute value is a charset which
3479         reserves graphic register N, which means that the charset is
3480         not designated initially but should be designated to graphic
3481         register N just before encoding a character in that charset.
3482
3483         If the value is nil, graphic register N is never used on
3484         encoding.
3485
3486    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3487         Each value takes t or nil.  See the section ISO2022 of
3488         `coding.h' for more information.
3489
3490    If `coding->type' is `coding_type_big5', element[4] is t to denote
3491    BIG5-ETen or nil to denote BIG5-HKU.
3492
3493    If `coding->type' takes the other value, element[4] is ignored.
3494
3495    Emacs Lisp's coding systems also carry information about format of
3496    end-of-line in a value of property `eol-type'.  If the value is
3497    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3498    means CODING_EOL_CR.  If it is not integer, it should be a vector
3499    of subsidiary coding systems of which property `eol-type' has one
3500    of the above values.
3501
3502 */
3503
3504 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3505    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3506    is setup so that no conversion is necessary and return -1, else
3507    return 0.  */
3508
3509 int
3510 setup_coding_system (coding_system, coding)
3511      Lisp_Object coding_system;
3512      struct coding_system *coding;
3513 {
3514   Lisp_Object coding_spec, coding_type, eol_type, plist;
3515   Lisp_Object val;
3516
3517   /* Start from an all-zero structure so that no member is left unset.  */
3518   bzero (coding, sizeof (struct coding_system));
3519
3520   /* Initialize some fields required for all kinds of coding systems.  */
3521   coding->symbol = coding_system;
3522   coding->heading_ascii = -1;
3523   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3524   coding->composing = COMPOSITION_DISABLED;
3525   coding->cmp_data = NULL;
3526
3527   if (NILP (coding_system))     /* nil is treated as an invalid coding system.  */
3528     goto label_invalid_coding_system;
3529
3530   coding_spec = Fget (coding_system, Qcoding_system);
3531
3532   if (!VECTORP (coding_spec)    /* The spec must be a vector of 5 elements...  */
3533       || XVECTOR (coding_spec)->size != 5
3534       || !CONSP (XVECTOR (coding_spec)->contents[3]))   /* ...whose slot 3 (the plist) is a cons.  */
3535     goto label_invalid_coding_system;
3536
3537   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3538   if (VECTORP (eol_type))       /* A vector: the eol format is left undecided.  */
3539     {
3540       coding->eol_type = CODING_EOL_UNDECIDED;
3541       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3542     }
3543   else if (XFASTINT (eol_type) == 1)    /* NOTE(review): no INTEGERP check; EOL_TYPE can be nil here.  */
3544     {
3545       coding->eol_type = CODING_EOL_CRLF;
3546       coding->common_flags
3547         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3548     }
3549   else if (XFASTINT (eol_type) == 2)
3550     {
3551       coding->eol_type = CODING_EOL_CR;
3552       coding->common_flags
3553         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3554     }
3555   else                          /* Any other value selects LF.  */
3556     coding->eol_type = CODING_EOL_LF;
3557
3558   coding_type = XVECTOR (coding_spec)->contents[0];
3559   /* Short cut: a symbol in slot 0 means undecided (t) or no-conversion.  */
3560   if (SYMBOLP (coding_type))
3561     {
3562       if (EQ (coding_type, Qt))
3563         {
3564           coding->type = coding_type_undecided;
3565           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3566         }
3567       else
3568         coding->type = coding_type_no_conversion;
3569       /* Initialize this member.  Anything other than
3570          CODING_CATEGORY_IDX_UTF_16_BE and
3571          CODING_CATEGORY_IDX_UTF_16_LE is ok because they have
3572          special treatment in detect_eol.  */
3573       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3574
3575       return 0;
3576     }
3577
3578   /* Get values of coding system properties:
3579      `post-read-conversion', `pre-write-conversion',
3580      `translation-table-for-decode', `translation-table-for-encode'.  */
3581   plist = XVECTOR (coding_spec)->contents[3];
3582   /* Pre & post conversion functions should be disabled if
3583      inhibit_pre_post_conversion is nonzero.  This is the case that a code
3584      conversion function is called while those functions are running.  */
3585   if (! inhibit_pre_post_conversion)
3586     {
3587       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3588       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3589     }
3590   val = Fplist_get (plist, Qtranslation_table_for_decode);
3591   if (SYMBOLP (val))            /* A symbol names the table through its property.  */
3592     val = Fget (val, Qtranslation_table_for_decode);
3593   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3594   val = Fplist_get (plist, Qtranslation_table_for_encode);
3595   if (SYMBOLP (val))
3596     val = Fget (val, Qtranslation_table_for_encode);
3597   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3598   val = Fplist_get (plist, Qcoding_category);   /* The category is mandatory.  */
3599   if (!NILP (val))
3600     {
3601       val = Fget (val, Qcoding_category_index);
3602       if (INTEGERP (val))
3603         coding->category_idx = XINT (val);
3604       else
3605         goto label_invalid_coding_system;
3606     }
3607   else
3608     goto label_invalid_coding_system;
3609
3610   /* If the coding system has non-nil `composition' property, enable
3611      composition handling.  */
3612   val = Fplist_get (plist, Qcomposition);
3613   if (!NILP (val))
3614     coding->composing = COMPOSITION_NO;
3615
3616   switch (XFASTINT (coding_type))       /* NOTE(review): CODING_TYPE is not checked with INTEGERP first.  */
3617     {
3618     case 0:
3619       coding->type = coding_type_emacs_mule;
3620       coding->common_flags
3621         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3622       if (!NILP (coding->post_read_conversion))   /* Both masks are already set just above.  */
3623         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3624       if (!NILP (coding->pre_write_conversion))
3625         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3626       break;
3627
3628     case 1:
3629       coding->type = coding_type_sjis;
3630       coding->common_flags
3631         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3632       break;
3633
3634     case 2:
3635       coding->type = coding_type_iso2022;
3636       coding->common_flags
3637         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3638       {
3639         Lisp_Object val, temp;          /* This VAL shadows the function-level one.  */
3640         Lisp_Object *flags;
3641         int i, charset, reg_bits = 0;   /* Bit REG of REG_BITS: register REG is open to any charset.  */
3642
3643         val = XVECTOR (coding_spec)->contents[4];
3644
3645         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3646           goto label_invalid_coding_system;
3647
3648         flags = XVECTOR (val)->contents;       /* Elements 0..3 are the registers, 4..16 the boolean flags.  */
3649         coding->flags
3650           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3651              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3652              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3653              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3654              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3655              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3656              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3657              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3658              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3659              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3660              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3661              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3662              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3663              );
3664
3665         /* Invoke graphic register 0 to plane 0.  */
3666         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3667         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3668         CODING_SPEC_ISO_INVOCATION (coding, 1)
3669           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3670         /* Not single shifting at first.  */
3671         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3672         /* Beginning of buffer should also be regarded as bol. */
3673         CODING_SPEC_ISO_BOL (coding) = 1;
3674
3675         for (charset = 0; charset <= MAX_CHARSET; charset++)   /* 255 marks "no revision number".  */
3676           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3677         val = Vcharset_revision_alist;  /* Elements are (CHARSET . REVISION) pairs.  */
3678         while (CONSP (val))
3679           {
3680             charset = get_charset_id (Fcar_safe (XCAR (val)));
3681             if (charset >= 0
3682                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3683                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3684               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3685             val = XCDR (val);
3686           }
3687
3688         /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
3689            FLAGS[REG] can be one of below:
3690                 integer CHARSET: CHARSET occupies register REG,
3691                 t: designate nothing to REG initially, but can be used
3692                   by any charsets,
3693                 list of integer, nil, or t: designate the first
3694                   element (if integer) to REG initially, the remaining
3695                   elements (if integer) are designated to REG on request,
3696                   if an element is t, REG can be used by any charsets,
3697                 nil: REG is never used.  */
3698         for (charset = 0; charset <= MAX_CHARSET; charset++)
3699           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3700             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3701         for (i = 0; i < 4; i++)
3702           {
3703             if ((INTEGERP (flags[i])    /* A charset given by ID or by name.  */
3704                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3705                 || (charset = get_charset_id (flags[i])) >= 0)
3706               {
3707                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3708                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3709               }
3710             else if (EQ (flags[i], Qt))
3711               {
3712                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3713                 reg_bits |= 1 << i;
3714                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3715               }
3716             else if (CONSP (flags[i]))
3717               {
3718                 Lisp_Object tail;
3719                 tail = flags[i];
3720
3721                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3722                 if ((INTEGERP (XCAR (tail))    /* The first element is the initial designation.  */
3723                      && (charset = XINT (XCAR (tail)),
3724                          CHARSET_VALID_P (charset)))
3725                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3726                   {
3727                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3728                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3729                   }
3730                 else
3731                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3732                 tail = XCDR (tail);
3733                 while (CONSP (tail))    /* The remaining elements are designated on request.  */
3734                   {
3735                     if ((INTEGERP (XCAR (tail))
3736                          && (charset = XINT (XCAR (tail)),
3737                              CHARSET_VALID_P (charset)))
3738                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3739                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3740                         = i;
3741                     else if (EQ (XCAR (tail), Qt))
3742                       reg_bits |= 1 << i;
3743                     tail = XCDR (tail);
3744                   }
3745               }
3746             else
3747               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3748
3749             CODING_SPEC_ISO_DESIGNATION (coding, i)
3750               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3751           }
3752
3753         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3754           {
3755             /* REG 1 can be used only by locking shift in 7-bit env.  */
3756             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3757               reg_bits &= ~2;
3758             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3759               /* Without any shifting, only REG 0 and 1 can be used.  */
3760               reg_bits &= 3;
3761           }
3762
3763         if (reg_bits)
3764           for (charset = 0; charset <= MAX_CHARSET; charset++)
3765             {
3766               if (CHARSET_DEFINED_P (charset)
3767                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3768                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3769                 {
3770                   /* There exist some default graphic registers to be
3771                      used by CHARSET.  */
3772
3773                   /* We had better avoid designating a charset of
3774                      CHARS96 to REG 0 as far as possible.  */
3775                   if (CHARSET_CHARS (charset) == 96)
3776                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3777                       = (reg_bits & 2
3778                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3779                   else
3780                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3781                       = (reg_bits & 1
3782                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3783                 }
3784             }
3785       }
3786       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3787       coding->spec.iso2022.last_invalid_designation_register = -1;
3788       break;
3789
3790     case 3:
3791       coding->type = coding_type_big5;
3792       coding->common_flags
3793         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3794       coding->flags             /* Slot 4 of the spec: nil selects HKU, non-nil ETEN.  */
3795         = (NILP (XVECTOR (coding_spec)->contents[4])
3796            ? CODING_FLAG_BIG5_HKU
3797            : CODING_FLAG_BIG5_ETEN);
3798       break;
3799
3800     case 4:
3801       coding->type = coding_type_ccl;
3802       coding->common_flags
3803         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3804       {
3805         val = XVECTOR (coding_spec)->contents[4];      /* A cons (DECODER . ENCODER).  */
3806         if (! CONSP (val)
3807             || setup_ccl_program (&(coding->spec.ccl.decoder),
3808                                   XCAR (val)) < 0
3809             || setup_ccl_program (&(coding->spec.ccl.encoder),
3810                                   XCDR (val)) < 0)
3811           goto label_invalid_coding_system;
3812
3813         bzero (coding->spec.ccl.valid_codes, 256);
3814         val = Fplist_get (plist, Qvalid_codes);
3815         if (CONSP (val))
3816           {
3817             Lisp_Object this;
3818
3819             for (; CONSP (val); val = XCDR (val))     /* Each element: a code or a (START . END) range.  */
3820               {
3821                 this = XCAR (val);
3822                 if (INTEGERP (this)
3823                     && XINT (this) >= 0 && XINT (this) < 256)
3824                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3825                 else if (CONSP (this)
3826                          && INTEGERP (XCAR (this))
3827                          && INTEGERP (XCDR (this)))
3828                   {
3829                     int start = XINT (XCAR (this));
3830                     int end = XINT (XCDR (this));
3831
3832                     if (start >= 0 && start <= end && end < 256)      /* Ranges out of 0..255 are ignored.  */
3833                       while (start <= end)
3834                         coding->spec.ccl.valid_codes[start++] = 1;
3835                   }
3836               }
3837           }
3838       }
3839       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3840       coding->spec.ccl.cr_carryover = 0;
3841       coding->spec.ccl.eight_bit_carryover[0] = 0;
3842       break;
3843
3844     case 5:
3845       coding->type = coding_type_raw_text;
3846       break;
3847
3848     default:
3849       goto label_invalid_coding_system;
3850     }
3851   return 0;
3852
3853  label_invalid_coding_system:   /* Fall back to no-conversion with LF eol, and report failure.  */
3854   coding->type = coding_type_no_conversion;
3855   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3856   coding->common_flags = 0;
3857   coding->eol_type = CODING_EOL_LF;
3858   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3859   return -1;
3860 }
3861
3862 /* Release every block of the composition data chained from CODING
3863    and reset CODING->cmp_data to NULL.  An empty chain is a no-op.  */
3864 void
3865 coding_free_composition_data (coding)
3866      struct coding_system *coding;
3867 {
3868   struct composition_data *block = coding->cmp_data, *following;
3869
3870   if (block == NULL)
3871     return;
3872   /* The blocks are linked through `prev' and `next'; step back along
3873      `prev' to the first block, then release them front to back.  */
3874   for (; block->prev != NULL; block = block->prev)
3875     continue;
3876   for (; block != NULL; block = following)
3877     {
3878       following = block->next;
3879       xfree (block);
3880     }
3881
3882   coding->cmp_data = NULL;
3883 }
3884
3885 /* Store POS in the `char_offset' member of CODING->cmp_data and of
3886    every block reachable from it through the `next' links.  */
3887
3888 void
3889 coding_adjust_composition_offset (coding, pos)
3890      struct coding_system *coding;
3891      int pos;
3892 {
3893   struct composition_data *block = coding->cmp_data;
3894
3895   for (; block; block = block->next)
3896     block->char_offset = pos;
3897 }
3898
3899 /* Re-setup CODING as raw-text, or as the subsidiary of raw-text that
3900    matches the eol_type already stored in CODING.  CODING must have
3901    been set up for some coding system beforehand.  Nothing is done if
3902    CODING is already of type raw-text.  */
3903
3904 void
3905 setup_raw_text_coding_system (coding)
3906      struct coding_system *coding;
3907 {
3908   Lisp_Object subsidiaries;
3909
3910   if (coding->type == coding_type_raw_text)
3911     return;
3912
3913   coding->symbol = Qraw_text;
3914   coding->type = coding_type_raw_text;
3915   if (coding->eol_type != CODING_EOL_UNDECIDED)
3916     {
3917       /* The `eol-type' property of raw-text is expected to be a vector
3918          of three subsidiaries, which is indexed here by the eol type.  */
3919       subsidiaries = Fget (Qraw_text, Qeol_type);
3920       if (VECTORP (subsidiaries) && XVECTOR (subsidiaries)->size == 3)
3921         coding->symbol = XVECTOR (subsidiaries)->contents[coding->eol_type];
3922     }
3923   /* Set CODING up again from scratch for the symbol chosen above.  */
3924   setup_coding_system (coding->symbol, coding);
3925 }
3926
3927 /* Emacs has a mechanism to automatically detect a coding system if it
3928    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3929    it's impossible to distinguish some coding systems accurately
3930    because they use the same range of codes.  So, at first, coding
3931    systems are grouped into the following categories:
3932
3933    o coding-category-emacs-mule
3934
3935         The category for a coding system which has the same code range
3936         as Emacs' internal format.  Assigned the coding-system (Lisp
3937         symbol) `emacs-mule' by default.
3938
3939    o coding-category-sjis
3940
3941         The category for a coding system which has the same code range
3942         as SJIS.  Assigned the coding-system (Lisp
3943         symbol) `japanese-shift-jis' by default.
3944
3945    o coding-category-iso-7
3946
3947         The category for a coding system which has the same code range
3948         as ISO2022 of 7-bit environment.  This doesn't use any locking
3949         shift and single shift functions.  This can encode/decode all
3950         charsets.  Assigned the coding-system (Lisp symbol)
3951         `iso-2022-7bit' by default.
3952
3953    o coding-category-iso-7-tight
3954
3955         Same as coding-category-iso-7 except that this can
3956         encode/decode only the specified charsets.
3957
3958    o coding-category-iso-8-1
3959
3960         The category for a coding system which has the same code range
3961         as ISO2022 of 8-bit environment and graphic plane 1 used only
3962         for DIMENSION1 charset.  This doesn't use any locking shift
3963         and single shift functions.  Assigned the coding-system (Lisp
3964         symbol) `iso-latin-1' by default.
3965
3966    o coding-category-iso-8-2
3967
3968         The category for a coding system which has the same code range
3969         as ISO2022 of 8-bit environment and graphic plane 1 used only
3970         for DIMENSION2 charset.  This doesn't use any locking shift
3971         and single shift functions.  Assigned the coding-system (Lisp
3972         symbol) `japanese-iso-8bit' by default.
3973
3974    o coding-category-iso-7-else
3975
3976         The category for a coding system which has the same code range
3977         as ISO2022 of 7-bit environment but uses locking shift or
3978         single shift functions.  Assigned the coding-system (Lisp
3979         symbol) `iso-2022-7bit-lock' by default.
3980
3981    o coding-category-iso-8-else
3982
3983         The category for a coding system which has the same code range
3984         as ISO2022 of 8-bit environment but uses locking shift or
3985         single shift functions.  Assigned the coding-system (Lisp
3986         symbol) `iso-2022-8bit-ss2' by default.
3987
3988    o coding-category-big5
3989
3990         The category for a coding system which has the same code range
3991         as BIG5.  Assigned the coding-system (Lisp symbol)
3992         `cn-big5' by default.
3993
3994    o coding-category-utf-8
3995
3996         The category for a coding system which has the same code range
3997         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3998         symbol) `utf-8' by default.
3999
4000    o coding-category-utf-16-be
4001
4002         The category for a coding system in which a text has an
4003         Unicode signature (cf. Unicode Standard) in the order of BIG
4004         endian at the head.  Assigned the coding-system (Lisp symbol)
4005         `utf-16-be' by default.
4006
4007    o coding-category-utf-16-le
4008
4009         The category for a coding system in which a text has an
4010         Unicode signature (cf. Unicode Standard) in the order of
4011         LITTLE endian at the head.  Assigned the coding-system (Lisp
4012         symbol) `utf-16-le' by default.
4013
4014    o coding-category-ccl
4015
4016         The category for a coding system of which encoder/decoder is
4017         written in CCL programs.  The default value is nil, i.e., no
4018         coding system is assigned.
4019
4020    o coding-category-binary
4021
4022         The category for a coding system not categorized in any of the
4023         above.  Assigned the coding-system (Lisp symbol)
4024         `no-conversion' by default.
4025
4026    Each of them is a Lisp symbol and the value is an actual
4027    `coding-system' (this is also a Lisp symbol) assigned by a user.
4028    What Emacs does actually is to detect a category of coding system.
4029    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4030    decide a single possible category, it selects a category of the
4031    highest priority.  Priorities of categories are also specified by a
4032    user in a Lisp variable `coding-category-list'.
4033
4034 */
4035
4036 static
4037 int ascii_skip_code[256];   /* Indexed by byte; detect_coding_mask skips bytes whose entry is nonzero.  */
4038
4039 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4040    Return an integer in which the flag bits (CODING_CATEGORY_MASK_XXX
4041    in `coding.h') of the possible coding categories are set, or 0 if
4042    the text contains nothing that needs detection.  If PRIORITIES is
4043    non-NULL, it should point to the table `coding_priorities'; in that
4044    case only the flag bit of the matching category of the highest
4045    priority is returned.  If MULTIBYTEP is nonzero, 8-bit codes of the
4046    range 0x80..0x9F are in multibyte form.
4047
4048    How many ASCII characters are at the head is returned as *SKIP.  */
4049
4050 static int
4051 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4052      unsigned char *source;
4053      int src_bytes, *priorities, *skip;
4054      int multibytep;
4055 {
4056   register unsigned char c;
4057   unsigned char *src = source, *src_end = source + src_bytes;
4058   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4059   int i;
4060
4061   /* At first, skip all ASCII characters and control characters except
4062      for three ISO2022 specific control characters.  */
4063   ascii_skip_code[ISO_CODE_SO] = 0;
4064   ascii_skip_code[ISO_CODE_SI] = 0;
4065   ascii_skip_code[ISO_CODE_ESC] = 0;
4066
4067  label_loop_detect_coding:
4068   while (src < src_end && ascii_skip_code[*src]) src++;
4069   *skip = src - source;
4070
4071   if (src >= src_end)
4072     /* We found nothing other than ASCII.  There's nothing to do.  */
4073     return 0;
4074
4075   c = *src;
4076   /* The text seems to be encoded in some multilingual coding system.
4077      Now, try to find in which coding system the text is encoded.  */
4078   if (c < 0x80)
4079     {
4080       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4081       /* C is an ISO2022 specific control code of C0.  */
4082       mask = detect_coding_iso2022 (src, src_end, multibytep);
4083       if (mask == 0)
4084         {
4085           /* No valid ISO2022 code follows C.  Skip this kind of code from
4086              now on and try again.  */
4086           src++;
4087           if (c == ISO_CODE_ESC)
4088             ascii_skip_code[ISO_CODE_ESC] = 1;
4089           else
4090             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4091           goto label_loop_detect_coding;
4092         }
4093       if (priorities)
4094         {
4095           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4096             {
4097               if (mask & priorities[i])
4098                 return priorities[i];
4099             }
4100           return CODING_CATEGORY_MASK_RAW_TEXT;
4101         }
4102     }
4103   else
4104     {
4105       int try;                         /* Categories worth examining for C.  */
4106       unsigned char *next = src + 1;   /* The byte following code C.  */
4107       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL && next < src_end)
4108         c = *next++ - 0x20;    /* C takes two bytes here; never read past SRC_END.  */
4109
4110       if (c < 0xA0)
4111         {
4112           /* C is the first byte of SJIS character code,
4113              or a leading-code of Emacs' internal format (emacs-mule),
4114              or the first byte of UTF-16.  */
4115           try = (CODING_CATEGORY_MASK_SJIS
4116                   | CODING_CATEGORY_MASK_EMACS_MULE
4117                   | CODING_CATEGORY_MASK_UTF_16_BE
4118                   | CODING_CATEGORY_MASK_UTF_16_LE);
4119
4120           /* Or, if C is a special latin extra code,
4121              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4122              or is an ISO2022 control-sequence-introducer (CSI) followed by
4123              `]', `0]', `1]' or `2]', we should also consider ISO2022 codings.  */
4124           if ((VECTORP (Vlatin_extra_code_table)
4125                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4126               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4127               || (c == ISO_CODE_CSI
4128                   && (next < src_end
4129                       && (*next == ']'
4130                           || ((*next == '0' || *next == '1' || *next == '2')
4131                               && next + 1 < src_end
4132                               && next[1] == ']')))))
4133             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4134                      | CODING_CATEGORY_MASK_ISO_8BIT);
4135         }
4136       else
4137         /* C is a character of ISO2022 in graphic plane right,
4138            or a SJIS's 1-byte character code (i.e. JISX0201),
4139            or the first byte of BIG5's 2-byte code,
4140            or the first byte of UTF-8/16.  */
4141         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4142                 | CODING_CATEGORY_MASK_ISO_8BIT
4143                 | CODING_CATEGORY_MASK_SJIS
4144                 | CODING_CATEGORY_MASK_BIG5
4145                 | CODING_CATEGORY_MASK_UTF_8
4146                 | CODING_CATEGORY_MASK_UTF_16_BE
4147                 | CODING_CATEGORY_MASK_UTF_16_LE);
4148
4149       /* Or, we may have to consider the possibility of CCL.  */
4150       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4151           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4152               ->spec.ccl.valid_codes)[c])
4153         try |= CODING_CATEGORY_MASK_CCL;
4154
4155       mask = 0;
4156       utf16_examined_p = iso2022_examined_p = 0;
4157       if (priorities)           /* Examine categories in priority order; return the first match.  */
4158         {
4159           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4160             {
4161               if (!iso2022_examined_p
4162                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4163                 {
4164                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4165                   iso2022_examined_p = 1;
4166                 }
4167               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4168                 mask |= detect_coding_sjis (src, src_end, multibytep);
4169               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4170                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4171               else if (!utf16_examined_p
4172                        && (priorities[i] & try &
4173                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4174                 {
4175                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4176                   utf16_examined_p = 1;
4177                 }
4178               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4179                 mask |= detect_coding_big5 (src, src_end, multibytep);
4180               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4181                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4182               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4183                 mask |= detect_coding_ccl (src, src_end, multibytep);
4184               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4185                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4186               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4187                 mask |= CODING_CATEGORY_MASK_BINARY;
4188               if (mask & priorities[i])
4189                 return priorities[i];
4190             }
4191           return CODING_CATEGORY_MASK_RAW_TEXT;
4192         }
4193       if (try & CODING_CATEGORY_MASK_ISO)       /* No priorities: collect every possible category.  */
4194         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4195       if (try & CODING_CATEGORY_MASK_SJIS)
4196         mask |= detect_coding_sjis (src, src_end, multibytep);
4197       if (try & CODING_CATEGORY_MASK_BIG5)
4198         mask |= detect_coding_big5 (src, src_end, multibytep);
4199       if (try & CODING_CATEGORY_MASK_UTF_8)
4200         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4201       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4202         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4203       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4204         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4205       if (try & CODING_CATEGORY_MASK_CCL)
4206         mask |= detect_coding_ccl (src, src_end, multibytep);
4207     }
4208   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4209 }
4210
4211 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4212    The information of the detected coding system is set in CODING.  */
4213
4214 void
4215 detect_coding (coding, src, src_bytes)
4216      struct coding_system *coding;
4217      const unsigned char *src;
4218      int src_bytes;
4219 {
4220   unsigned int idx;
4221   int skip, mask;
4222   Lisp_Object val;
4223
4224   /* As a priority table is passed, only one category is reported.  */
4225   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4226                              coding->src_multibyte);
4227   coding->heading_ascii = skip;
4228
4229   if (!mask) return;            /* Only ASCII was found; keep CODING as it is.  */
4230
4231   /* We found a single coding system of the highest priority in MASK.  */
4232   idx = 0;
4233   while (mask && ! (mask & 1)) mask >>= 1, idx++;      /* IDX = position of the lowest set bit.  */
4234   if (! mask)
4235     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4236
4237   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4238
4239   if (coding->eol_type != CODING_EOL_UNDECIDED)
4240     {
4241       Lisp_Object tmp;          /* Subsidiaries of VAL, indexed by eol type.  */
4242       tmp = Fget (val, Qeol_type);
4243       if (VECTORP (tmp)         /* Never index beyond the subsidiaries vector.  */
4244           && coding->eol_type >= 0 && coding->eol_type < XVECTOR (tmp)->size)
4245         val = XVECTOR (tmp)->contents[coding->eol_type];
4246     }
4247
4248   /* Setup this new coding system while preserving some slots.  */
4249   {
4250     int src_multibyte = coding->src_multibyte;
4251     int dst_multibyte = coding->dst_multibyte;
4252
4253     setup_coding_system (val, coding);
4254     coding->src_multibyte = src_multibyte;
4255     coding->dst_multibyte = dst_multibyte;
4256     coding->heading_ascii = skip;
4257   }
4258 }
4259
4260 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4261    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4262    CODING_EOL_CR, CODING_EOL_UNDECIDED (no end-of-line was found), and
4263    CODING_EOL_INCONSISTENT (different formats were found).  How many
4264    non-eol characters are at the head is returned as *SKIP.  */
4265
4266 #define MAX_EOL_CHECK_COUNT 3   /* At most this many end-of-lines are examined.  */
4267
4268 static int
4269 detect_eol_type (source, src_bytes, skip)
4270      unsigned char *source;
4271      int src_bytes, *skip;
4272 {
4273   unsigned char *src = source, *src_end = src + src_bytes;
4274   unsigned char c;
4275   int total = 0;                /* How many end-of-lines are found so far.  */
4276   int eol_type = CODING_EOL_UNDECIDED;
4277   int this_eol_type;
4278
4279   *skip = 0;
4280
4281   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4282     {
4283       c = *src++;
4284       if (c == '\n' || c == '\r')
4285         {
4286           if (total == 0)       /* Record the offset of the first end-of-line only.  */
4287             *skip = src - 1 - source;
4288           total++;
4289           if (c == '\n')
4290             this_eol_type = CODING_EOL_LF;
4291           else if (src >= src_end || *src != '\n')
4292             this_eol_type = CODING_EOL_CR;
4293           else
4294             this_eol_type = CODING_EOL_CRLF, src++;
4295
4296           if (eol_type == CODING_EOL_UNDECIDED)
4297             /* This is the first end-of-line.  */
4298             eol_type = this_eol_type;
4299           else if (eol_type != this_eol_type)
4300             {
4301               /* The found type is different from what found before.  */
4302               eol_type = CODING_EOL_INCONSISTENT;
4303               break;
4304             }
4305         }
4306     }
4307   /* Test TOTAL, not *SKIP: 0 is a valid offset for the first end-of-line.  */
4308   if (total == 0)
4309     *skip = src_end - source;
4310   return eol_type;
4311 }
4312
4313 /* Like detect_eol_type, but detect EOL type in 2-octet
4314    big-endian/little-endian format for coding systems utf-16-be and
4315    utf-16-le.  BIG_ENDIAN_P nonzero means the first octet of a pair is the high one.  */
4316
4317 static int
4318 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4319      unsigned char *source;
4320      int src_bytes, *skip, big_endian_p;
4321 {
4322   unsigned char *src = source, *src_end = src + src_bytes;
4323   unsigned int c1, c2;
4324   int total = 0;                /* How many end-of-lines are found so far.  */
4325   int eol_type = CODING_EOL_UNDECIDED;
4326   int this_eol_type;
4327   int msb, lsb;                 /* Offsets of the high and low octets in a pair.  */
4328
4329   if (big_endian_p)
4330     msb = 0, lsb = 1;
4331   else
4332     msb = 1, lsb = 0;
4333
4334   *skip = 0;
4335
4336   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4337     {
4338       c1 = (src[msb] << 8) | (src[lsb]);
4339       src += 2;
4340
4341       if (c1 == '\n' || c1 == '\r')
4342         {
4343           if (total == 0)       /* Record the offset of the first end-of-line only.  */
4344             *skip = src - 2 - source;
4345           total++;
4346           if (c1 == '\n')
4347             {
4348               this_eol_type = CODING_EOL_LF;
4349             }
4350           else
4351             {
4352               if ((src + 1) >= src_end)        /* No complete pair follows the CR.  */
4353                 {
4354                   this_eol_type = CODING_EOL_CR;
4355                 }
4356               else
4357                 {
4358                   c2 = (src[msb] << 8) | (src[lsb]);
4359                   if (c2 == '\n')
4360                     this_eol_type = CODING_EOL_CRLF, src += 2;
4361                   else
4362                     this_eol_type = CODING_EOL_CR;
4363                 }
4364             }
4365
4366           if (eol_type == CODING_EOL_UNDECIDED)
4367             /* This is the first end-of-line.  */
4368             eol_type = this_eol_type;
4369           else if (eol_type != this_eol_type)
4370             {
4371               /* The found type is different from what found before.  */
4372               eol_type = CODING_EOL_INCONSISTENT;
4373               break;
4374             }
4375         }
4376     }
4377   /* Test TOTAL, not *SKIP: 0 is a valid offset for the first end-of-line.  */
4378   if (total == 0)
4379     *skip = src_end - source;
4380   return eol_type;
4381 }
4382
4383 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4384    is encoded.  If it detects an appropriate format of end-of-line, it
4385    sets the information in *CODING.  */
4386
4387 void
4388 detect_eol (coding, src, src_bytes)
4389      struct coding_system *coding;
4390      const unsigned char *src;
4391      int src_bytes;
4392 {
4393   Lisp_Object val;
4394   int skip;
4395   int eol_type;
4396
4397   switch (coding->category_idx)
4398     {
4399     case CODING_CATEGORY_IDX_UTF_16_BE:
4400       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4401       break;
4402     case CODING_CATEGORY_IDX_UTF_16_LE:
4403       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4404       break;
4405     default:
4406       eol_type = detect_eol_type (src, src_bytes, &skip);
4407       break;
4408     }
4409
4410   if (coding->heading_ascii > skip)
4411     coding->heading_ascii = skip;
4412   else
4413     skip = coding->heading_ascii;
4414
4415   if (eol_type == CODING_EOL_UNDECIDED)
4416     return;
4417   if (eol_type == CODING_EOL_INCONSISTENT)
4418     {
4419 #if 0
4420       /* This code is suppressed until we find a better way to
4421          distinguish raw text file and binary file.  */
4422
4423       /* If we have already detected that the coding is raw-text, the
4424          coding should actually be no-conversion.  */
4425       if (coding->type == coding_type_raw_text)
4426         {
4427           setup_coding_system (Qno_conversion, coding);
4428           return;
4429         }
4430       /* Else, let's decode only text code anyway.  */
4431 #endif /* 0 */
4432       eol_type = CODING_EOL_LF;
4433     }
4434
4435   val = Fget (coding->symbol, Qeol_type);
4436   if (VECTORP (val) && XVECTOR (val)->size == 3)
4437     {
4438       int src_multibyte = coding->src_multibyte;
4439       int dst_multibyte = coding->dst_multibyte;
4440       struct composition_data *cmp_data = coding->cmp_data;
4441
4442       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4443       coding->src_multibyte = src_multibyte;
4444       coding->dst_multibyte = dst_multibyte;
4445       coding->heading_ascii = skip;
4446       coding->cmp_data = cmp_data;
4447     }
4448 }
4449
/* Slack added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source byte count is multiplied when estimating
   the size of a decoding buffer for CODING (a pointer to struct
   coding_system).  The argument is parenthesized so that any pointer
   expression can be passed.  */
#define DECODING_BUFFER_MAG(coding)                       \
  ((coding)->type == coding_type_iso2022                  \
   ? 3                                                    \
   : ((coding)->type == coding_type_ccl                   \
      ? (coding)->spec.ccl.decoder.buf_magnification      \
      : 2))
4458
4459 /* Return maximum size (bytes) of a buffer enough for decoding
4460    SRC_BYTES of text encoded in CODING.  */
4461
4462 int
4463 decoding_buffer_size (coding, src_bytes)
4464      struct coding_system *coding;
4465      int src_bytes;
4466 {
4467   return (src_bytes * DECODING_BUFFER_MAG (coding)
4468           + CONVERSION_BUFFER_EXTRA_ROOM);
4469 }
4470
4471 /* Return maximum size (bytes) of a buffer enough for encoding
4472    SRC_BYTES of text to CODING.  */
4473
4474 int
4475 encoding_buffer_size (coding, src_bytes)
4476      struct coding_system *coding;
4477      int src_bytes;
4478 {
4479   int magnification;
4480
4481   if (coding->type == coding_type_ccl)
4482     magnification = coding->spec.ccl.encoder.buf_magnification;
4483   else if (CODING_REQUIRE_ENCODING (coding))
4484     magnification = 3;
4485   else
4486     magnification = 1;
4487
4488   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4489 }
4490
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Allocated size of DATA in bytes.  */
  int on_stack;                 /* 1 if DATA was allocated by alloca,
                                   0 if by xmalloc.  */
  unsigned char *data;          /* The buffer itself.  */
};

/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  The value is parenthesized so that it
   stays a single operand in any expression.  */
#define MAX_ALLOCA (16*1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer, not
   a pointer to it).  Requests smaller than MAX_ALLOCA are served by
   alloca, the others by xmalloc; BUF.on_stack records which.  LEN is
   evaluated more than once, so it must have no side effects.  */
#define allocate_conversion_buffer(buf, len)              \
  do {                                                    \
    if ((len) < MAX_ALLOCA)                               \
      {                                                   \
        (buf).data = (unsigned char *) alloca (len);      \
        (buf).on_stack = 1;                               \
      }                                                   \
    else                                                  \
      {                                                   \
        (buf).data = (unsigned char *) xmalloc (len);     \
        (buf).on_stack = 0;                               \
      }                                                   \
    (buf).size = (len);                                   \
  } while (0)
4518
4519 /* Double the allocated memory for *BUF.  */
4520 static void
4521 extend_conversion_buffer (buf)
4522      struct conversion_buffer *buf;
4523 {
4524   if (buf->on_stack)
4525     {
4526       unsigned char *save = buf->data;
4527       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4528       bcopy (save, buf->data, buf->size);
4529       buf->on_stack = 0;
4530     }
4531   else
4532     {
4533       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4534     }
4535   buf->size *= 2;
4536 }
4537
4538 /* Free the allocated memory for BUF if it is not on stack.  */
4539 static void
4540 free_conversion_buffer (buf)
4541      struct conversion_buffer *buf;
4542 {
4543   if (!buf->on_stack)
4544     xfree (buf->data);
4545 }
4546
4547 int
4548 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4549      struct coding_system *coding;
4550      unsigned char *source, *destination;
4551      int src_bytes, dst_bytes, encodep;
4552 {
4553   struct ccl_program *ccl
4554     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4555   unsigned char *dst = destination;
4556
4557   ccl->suppress_error = coding->suppress_error;
4558   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4559   if (encodep)
4560     {
4561       /* On encoding, EOL format is converted within ccl_driver.  For
4562          that, setup proper information in the structure CCL.  */
4563       ccl->eol_type = coding->eol_type;
4564       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4565         ccl->eol_type = CODING_EOL_LF;
4566       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4567       ccl->eight_bit_control = coding->dst_multibyte;
4568     }
4569   else
4570     ccl->eight_bit_control = 1;
4571   ccl->multibyte = coding->src_multibyte;
4572   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4573     {
4574       /* Move carryover bytes to DESTINATION.  */
4575       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4576       while (*p)
4577         *dst++ = *p++;
4578       coding->spec.ccl.eight_bit_carryover[0] = 0;
4579       if (dst_bytes)
4580         dst_bytes -= dst - destination;
4581     }
4582
4583   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4584                                   &(coding->consumed))
4585                       + dst - destination);
4586
4587   if (encodep)
4588     {
4589       coding->produced_char = coding->produced;
4590       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4591     }
4592   else if (!ccl->eight_bit_control)
4593     {
4594       /* The produced bytes forms a valid multibyte sequence. */
4595       coding->produced_char
4596         = multibyte_chars_in_text (destination, coding->produced);
4597       coding->spec.ccl.eight_bit_carryover[0] = 0;
4598     }
4599   else
4600     {
4601       /* On decoding, the destination should always multibyte.  But,
4602          CCL program might have been generated an invalid multibyte
4603          sequence.  Here we make such a sequence valid as
4604          multibyte.  */
4605       int bytes
4606         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4607
4608       if ((coding->consumed < src_bytes
4609            || !ccl->last_block)
4610           && coding->produced >= 1
4611           && destination[coding->produced - 1] >= 0x80)
4612         {
4613           /* We should not convert the tailing 8-bit codes to
4614              multibyte form even if they doesn't form a valid
4615              multibyte sequence.  They may form a valid sequence in
4616              the next call.  */
4617           int carryover = 0;
4618
4619           if (destination[coding->produced - 1] < 0xA0)
4620             carryover = 1;
4621           else if (coding->produced >= 2)
4622             {
4623               if (destination[coding->produced - 2] >= 0x80)
4624                 {
4625                   if (destination[coding->produced - 2] < 0xA0)
4626                     carryover = 2;
4627                   else if (coding->produced >= 3
4628                            && destination[coding->produced - 3] >= 0x80
4629                            && destination[coding->produced - 3] < 0xA0)
4630                     carryover = 3;
4631                 }
4632             }
4633           if (carryover > 0)
4634             {
4635               BCOPY_SHORT (destination + coding->produced - carryover,
4636                            coding->spec.ccl.eight_bit_carryover,
4637                            carryover);
4638               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4639               coding->produced -= carryover;
4640             }
4641         }
4642       coding->produced = str_as_multibyte (destination, bytes,
4643                                            coding->produced,
4644                                            &(coding->produced_char));
4645     }
4646
4647   switch (ccl->status)
4648     {
4649     case CCL_STAT_SUSPEND_BY_SRC:
4650       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4651       break;
4652     case CCL_STAT_SUSPEND_BY_DST:
4653       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4654       break;
4655     case CCL_STAT_QUIT:
4656     case CCL_STAT_INVALID_CMD:
4657       coding->result = CODING_FINISH_INTERRUPT;
4658       break;
4659     default:
4660       coding->result = CODING_FINISH_NORMAL;
4661       break;
4662     }
4663   return coding->result;
4664 }
4665
4666 /* Decode EOL format of the text at PTR of BYTES length destructively
4667    according to CODING->eol_type.  This is called after the CCL
4668    program produced a decoded text at PTR.  If we do CRLF->LF
4669    conversion, update CODING->produced and CODING->produced_char.  */
4670
4671 static void
4672 decode_eol_post_ccl (coding, ptr, bytes)
4673      struct coding_system *coding;
4674      unsigned char *ptr;
4675      int bytes;
4676 {
4677   Lisp_Object val, saved_coding_symbol;
4678   unsigned char *pend = ptr + bytes;
4679   int dummy;
4680
4681   /* Remember the current coding system symbol.  We set it back when
4682      an inconsistent EOL is found so that `last-coding-system-used' is
4683      set to the coding system that doesn't specify EOL conversion.  */
4684   saved_coding_symbol = coding->symbol;
4685
4686   coding->spec.ccl.cr_carryover = 0;
4687   if (coding->eol_type == CODING_EOL_UNDECIDED)
4688     {
4689       /* Here, to avoid the call of setup_coding_system, we directly
4690          call detect_eol_type.  */
4691       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4692       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4693         coding->eol_type = CODING_EOL_LF;
4694       if (coding->eol_type != CODING_EOL_UNDECIDED)
4695         {
4696           val = Fget (coding->symbol, Qeol_type);
4697           if (VECTORP (val) && XVECTOR (val)->size == 3)
4698             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4699         }
4700       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4701     }
4702
4703   if (coding->eol_type == CODING_EOL_LF
4704       || coding->eol_type == CODING_EOL_UNDECIDED)
4705     {
4706       /* We have nothing to do.  */
4707       ptr = pend;
4708     }
4709   else if (coding->eol_type == CODING_EOL_CRLF)
4710     {
4711       unsigned char *pstart = ptr, *p = ptr;
4712
4713       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4714           && *(pend - 1) == '\r')
4715         {
4716           /* If the last character is CR, we can't handle it here
4717              because LF will be in the not-yet-decoded source text.
4718              Record that the CR is not yet processed.  */
4719           coding->spec.ccl.cr_carryover = 1;
4720           coding->produced--;
4721           coding->produced_char--;
4722           pend--;
4723         }
4724       while (ptr < pend)
4725         {
4726           if (*ptr == '\r')
4727             {
4728               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4729                 {
4730                   *p++ = '\n';
4731                   ptr += 2;
4732                 }
4733               else
4734                 {
4735                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4736                     goto undo_eol_conversion;
4737                   *p++ = *ptr++;
4738                 }
4739             }
4740           else if (*ptr == '\n'
4741                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4742             goto undo_eol_conversion;
4743           else
4744             *p++ = *ptr++;
4745           continue;
4746
4747         undo_eol_conversion:
4748           /* We have faced with inconsistent EOL format at PTR.
4749              Convert all LFs before PTR back to CRLFs.  */
4750           for (p--, ptr--; p >= pstart; p--)
4751             {
4752               if (*p == '\n')
4753                 *ptr-- = '\n', *ptr-- = '\r';
4754               else
4755                 *ptr-- = *p;
4756             }
4757           /*  If carryover is recorded, cancel it because we don't
4758               convert CRLF anymore.  */
4759           if (coding->spec.ccl.cr_carryover)
4760             {
4761               coding->spec.ccl.cr_carryover = 0;
4762               coding->produced++;
4763               coding->produced_char++;
4764               pend++;
4765             }
4766           p = ptr = pend;
4767           coding->eol_type = CODING_EOL_LF;
4768           coding->symbol = saved_coding_symbol;
4769         }
4770       if (p < pend)
4771         {
4772           /* As each two-byte sequence CRLF was converted to LF, (PEND
4773              - P) is the number of deleted characters.  */
4774           coding->produced -= pend - p;
4775           coding->produced_char -= pend - p;
4776         }
4777     }
4778   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4779     {
4780       unsigned char *p = ptr;
4781
4782       for (; ptr < pend; ptr++)
4783         {
4784           if (*ptr == '\r')
4785             *ptr = '\n';
4786           else if (*ptr == '\n'
4787                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4788             {
4789               for (; p < ptr; p++)
4790                 {
4791                   if (*p == '\n')
4792                     *p = '\r';
4793                 }
4794               ptr = pend;
4795               coding->eol_type = CODING_EOL_LF;
4796               coding->symbol = saved_coding_symbol;
4797             }
4798         }
4799     }
4800 }
4801
4802 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4803    decoding, it may detect coding system and format of end-of-line if
4804    those are not yet decided.  The source should be unibyte, the
4805    result is multibyte if CODING->dst_multibyte is nonzero, else
4806    unibyte.  */
4807
4808 int
4809 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4810      struct coding_system *coding;
4811      const unsigned char *source;
4812      unsigned char *destination;
4813      int src_bytes, dst_bytes;
4814 {
4815   int extra = 0;
4816
4817   if (coding->type == coding_type_undecided)
4818     detect_coding (coding, source, src_bytes);
4819
4820   if (coding->eol_type == CODING_EOL_UNDECIDED
4821       && coding->type != coding_type_ccl)
4822     {
4823       detect_eol (coding, source, src_bytes);
4824       /* We had better recover the original eol format if we
4825          encounter an inconsistent eol format while decoding.  */
4826       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4827     }
4828
4829   coding->produced = coding->produced_char = 0;
4830   coding->consumed = coding->consumed_char = 0;
4831   coding->errors = 0;
4832   coding->result = CODING_FINISH_NORMAL;
4833
4834   switch (coding->type)
4835     {
4836     case coding_type_sjis:
4837       decode_coding_sjis_big5 (coding, source, destination,
4838                                src_bytes, dst_bytes, 1);
4839       break;
4840
4841     case coding_type_iso2022:
4842       decode_coding_iso2022 (coding, source, destination,
4843                              src_bytes, dst_bytes);
4844       break;
4845
4846     case coding_type_big5:
4847       decode_coding_sjis_big5 (coding, source, destination,
4848                                src_bytes, dst_bytes, 0);
4849       break;
4850
4851     case coding_type_emacs_mule:
4852       decode_coding_emacs_mule (coding, source, destination,
4853                                 src_bytes, dst_bytes);
4854       break;
4855
4856     case coding_type_ccl:
4857       if (coding->spec.ccl.cr_carryover)
4858         {
4859           /* Put the CR which was not processed by the previous call
4860              of decode_eol_post_ccl in DESTINATION.  It will be
4861              decoded together with the following LF by the call to
4862              decode_eol_post_ccl below.  */
4863           *destination = '\r';
4864           coding->produced++;
4865           coding->produced_char++;
4866           dst_bytes--;
4867           extra = coding->spec.ccl.cr_carryover;
4868         }
4869       ccl_coding_driver (coding, source, destination + extra,
4870                          src_bytes, dst_bytes, 0);
4871       if (coding->eol_type != CODING_EOL_LF)
4872         {
4873           coding->produced += extra;
4874           coding->produced_char += extra;
4875           decode_eol_post_ccl (coding, destination, coding->produced);
4876         }
4877       break;
4878
4879     default:
4880       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4881     }
4882
4883   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4884       && coding->mode & CODING_MODE_LAST_BLOCK
4885       && coding->consumed == src_bytes)
4886     coding->result = CODING_FINISH_NORMAL;
4887
4888   if (coding->mode & CODING_MODE_LAST_BLOCK
4889       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4890     {
4891       const unsigned char *src = source + coding->consumed;
4892       unsigned char *dst = destination + coding->produced;
4893
4894       src_bytes -= coding->consumed;
4895       coding->errors++;
4896       if (COMPOSING_P (coding))
4897         DECODE_COMPOSITION_END ('1');
4898       while (src_bytes--)
4899         {
4900           int c = *src++;
4901           dst += CHAR_STRING (c, dst);
4902           coding->produced_char++;
4903         }
4904       coding->consumed = coding->consumed_char = src - source;
4905       coding->produced = dst - destination;
4906       coding->result = CODING_FINISH_NORMAL;
4907     }
4908
4909   if (!coding->dst_multibyte)
4910     {
4911       coding->produced = str_as_unibyte (destination, coding->produced);
4912       coding->produced_char = coding->produced;
4913     }
4914
4915   return coding->result;
4916 }
4917
4918 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4919    multibyteness of the source is CODING->src_multibyte, the
4920    multibyteness of the result is always unibyte.  */
4921
4922 int
4923 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4924      struct coding_system *coding;
4925      const unsigned char *source;
4926      unsigned char *destination;
4927      int src_bytes, dst_bytes;
4928 {
4929   coding->produced = coding->produced_char = 0;
4930   coding->consumed = coding->consumed_char = 0;
4931   coding->errors = 0;
4932   coding->result = CODING_FINISH_NORMAL;
4933
4934   switch (coding->type)
4935     {
4936     case coding_type_sjis:
4937       encode_coding_sjis_big5 (coding, source, destination,
4938                                src_bytes, dst_bytes, 1);
4939       break;
4940
4941     case coding_type_iso2022:
4942       encode_coding_iso2022 (coding, source, destination,
4943                              src_bytes, dst_bytes);
4944       break;
4945
4946     case coding_type_big5:
4947       encode_coding_sjis_big5 (coding, source, destination,
4948                                src_bytes, dst_bytes, 0);
4949       break;
4950
4951     case coding_type_emacs_mule:
4952       encode_coding_emacs_mule (coding, source, destination,
4953                                 src_bytes, dst_bytes);
4954       break;
4955
4956     case coding_type_ccl:
4957       ccl_coding_driver (coding, source, destination,
4958                          src_bytes, dst_bytes, 1);
4959       break;
4960
4961     default:
4962       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4963     }
4964
4965   if (coding->mode & CODING_MODE_LAST_BLOCK
4966       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4967     {
4968       const unsigned char *src = source + coding->consumed;
4969       unsigned char *dst = destination + coding->produced;
4970
4971       if (coding->type == coding_type_iso2022)
4972         ENCODE_RESET_PLANE_AND_REGISTER;
4973       if (COMPOSING_P (coding))
4974         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4975       if (coding->consumed < src_bytes)
4976         {
4977           int len = src_bytes - coding->consumed;
4978
4979           BCOPY_SHORT (src, dst, len);
4980           if (coding->src_multibyte)
4981             len = str_as_unibyte (dst, len);
4982           dst += len;
4983           coding->consumed = src_bytes;
4984         }
4985       coding->produced = coding->produced_char = dst - destination;
4986       coding->result = CODING_FINISH_NORMAL;
4987     }
4988
4989   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4990       && coding->consumed == src_bytes)
4991     coding->result = CODING_FINISH_NORMAL;
4992
4993   return coding->result;
4994 }
4995
4996 /* Scan text in the region between *BEG and *END (byte positions),
4997    skip characters which we don't have to decode by coding system
4998    CODING at the head and tail, then set *BEG and *END to the region
4999    of the text we actually have to convert.  The caller should move
5000    the gap out of the region in advance if the region is from a
5001    buffer.
5002
5003    If STR is not NULL, *BEG and *END are indices into STR.  */
5004
5005 static void
5006 shrink_decoding_region (beg, end, coding, str)
5007      int *beg, *end;
5008      struct coding_system *coding;
5009      unsigned char *str;
5010 {
5011   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5012   int eol_conversion;
5013   Lisp_Object translation_table;
5014
5015   if (coding->type == coding_type_ccl
5016       || coding->type == coding_type_undecided
5017       || coding->eol_type != CODING_EOL_LF
5018       || !NILP (coding->post_read_conversion)
5019       || coding->composing != COMPOSITION_DISABLED)
5020     {
5021       /* We can't skip any data.  */
5022       return;
5023     }
5024   if (coding->type == coding_type_no_conversion
5025       || coding->type == coding_type_raw_text
5026       || coding->type == coding_type_emacs_mule)
5027     {
5028       /* We need no conversion, but don't have to skip any data here.
5029          Decoding routine handles them effectively anyway.  */
5030       return;
5031     }
5032
5033   translation_table = coding->translation_table_for_decode;
5034   if (NILP (translation_table) && !NILP (Venable_character_translation))
5035     translation_table = Vstandard_translation_table_for_decode;
5036   if (CHAR_TABLE_P (translation_table))
5037     {
5038       int i;
5039       for (i = 0; i < 128; i++)
5040         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5041           break;
5042       if (i < 128)
5043         /* Some ASCII character should be translated.  We give up
5044            shrinking.  */
5045         return;
5046     }
5047
5048   if (coding->heading_ascii >= 0)
5049     /* Detection routine has already found how much we can skip at the
5050        head.  */
5051     *beg += coding->heading_ascii;
5052
5053   if (str)
5054     {
5055       begp_orig = begp = str + *beg;
5056       endp_orig = endp = str + *end;
5057     }
5058   else
5059     {
5060       begp_orig = begp = BYTE_POS_ADDR (*beg);
5061       endp_orig = endp = begp + *end - *beg;
5062     }
5063
5064   eol_conversion = (coding->eol_type == CODING_EOL_CR
5065                     || coding->eol_type == CODING_EOL_CRLF);
5066
5067   switch (coding->type)
5068     {
5069     case coding_type_sjis:
5070     case coding_type_big5:
5071       /* We can skip all ASCII characters at the head.  */
5072       if (coding->heading_ascii < 0)
5073         {
5074           if (eol_conversion)
5075             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5076           else
5077             while (begp < endp && *begp < 0x80) begp++;
5078         }
5079       /* We can skip all ASCII characters at the tail except for the
5080          second byte of SJIS or BIG5 code.  */
5081       if (eol_conversion)
5082         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5083       else
5084         while (begp < endp && endp[-1] < 0x80) endp--;
5085       /* Do not consider LF as ascii if preceded by CR, since that
5086          confuses eol decoding. */
5087       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5088         endp++;
5089       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5090         endp++;
5091       break;
5092
5093     case coding_type_iso2022:
5094       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5095         /* We can't skip any data.  */
5096         break;
5097       if (coding->heading_ascii < 0)
5098         {
5099           /* We can skip all ASCII characters at the head except for a
5100              few control codes.  */
5101           while (begp < endp && (c = *begp) < 0x80
5102                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5103                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5104                  && (!eol_conversion || c != ISO_CODE_LF))
5105             begp++;
5106         }
5107       switch (coding->category_idx)
5108         {
5109         case CODING_CATEGORY_IDX_ISO_8_1:
5110         case CODING_CATEGORY_IDX_ISO_8_2:
5111           /* We can skip all ASCII characters at the tail.  */
5112           if (eol_conversion)
5113             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5114           else
5115             while (begp < endp && endp[-1] < 0x80) endp--;
5116           /* Do not consider LF as ascii if preceded by CR, since that
5117              confuses eol decoding. */
5118           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5119             endp++;
5120           break;
5121
5122         case CODING_CATEGORY_IDX_ISO_7:
5123         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5124           {
5125             /* We can skip all characters at the tail except for 8-bit
5126                codes and ESC and the following 2-byte at the tail.  */
5127             unsigned char *eight_bit = NULL;
5128
5129             if (eol_conversion)
5130               while (begp < endp
5131                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5132                 {
5133                   if (!eight_bit && c & 0x80) eight_bit = endp;
5134                   endp--;
5135                 }
5136             else
5137               while (begp < endp
5138                      && (c = endp[-1]) != ISO_CODE_ESC)
5139                 {
5140                   if (!eight_bit && c & 0x80) eight_bit = endp;
5141                   endp--;
5142                 }
5143             /* Do not consider LF as ascii if preceded by CR, since that
5144                confuses eol decoding. */
5145             if (begp < endp && endp < endp_orig
5146                 && endp[-1] == '\r' && endp[0] == '\n')
5147               endp++;
5148             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5149               {
5150                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5151                   /* This is an ASCII designation sequence.  We can
5152                      surely skip the tail.  But, if we have
5153                      encountered an 8-bit code, skip only the codes
5154                      after that.  */
5155                   endp = eight_bit ? eight_bit : endp + 2;
5156                 else
5157                   /* Hmmm, we can't skip the tail.  */
5158                   endp = endp_orig;
5159               }
5160             else if (eight_bit)
5161               endp = eight_bit;
5162           }
5163         }
5164       break;
5165
5166     default:
5167       abort ();
5168     }
5169   *beg += begp - begp_orig;
5170   *end += endp - endp_orig;
5171   return;
5172 }
5173
5174 /* Like shrink_decoding_region but for encoding.  */
5175
5176 static void
5177 shrink_encoding_region (beg, end, coding, str)
5178      int *beg, *end;
5179      struct coding_system *coding;
5180      unsigned char *str;
5181 {
5182   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5183   int eol_conversion;
5184   Lisp_Object translation_table;
5185
5186   if (coding->type == coding_type_ccl
5187       || coding->eol_type == CODING_EOL_CRLF
5188       || coding->eol_type == CODING_EOL_CR
5189       || (coding->cmp_data && coding->cmp_data->used > 0))
5190     {
5191       /* We can't skip any data.  */
5192       return;
5193     }
5194   if (coding->type == coding_type_no_conversion
5195       || coding->type == coding_type_raw_text
5196       || coding->type == coding_type_emacs_mule
5197       || coding->type == coding_type_undecided)
5198     {
5199       /* We need no conversion, but don't have to skip any data here.
5200          Encoding routine handles them effectively anyway.  */
5201       return;
5202     }
5203
5204   translation_table = coding->translation_table_for_encode;
5205   if (NILP (translation_table) && !NILP (Venable_character_translation))
5206     translation_table = Vstandard_translation_table_for_encode;
5207   if (CHAR_TABLE_P (translation_table))
5208     {
5209       int i;
5210       for (i = 0; i < 128; i++)
5211         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5212           break;
5213       if (i < 128)
5214         /* Some ASCII character should be translated.  We give up
5215            shrinking.  */
5216         return;
5217     }
5218
5219   if (str)
5220     {
5221       begp_orig = begp = str + *beg;
5222       endp_orig = endp = str + *end;
5223     }
5224   else
5225     {
5226       begp_orig = begp = BYTE_POS_ADDR (*beg);
5227       endp_orig = endp = begp + *end - *beg;
5228     }
5229
5230   eol_conversion = (coding->eol_type == CODING_EOL_CR
5231                     || coding->eol_type == CODING_EOL_CRLF);
5232
5233   /* Here, we don't have to check coding->pre_write_conversion because
5234      the caller is expected to have handled it already.  */
5235   switch (coding->type)
5236     {
5237     case coding_type_iso2022:
5238       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5239         /* We can't skip any data.  */
5240         break;
5241       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5242         {
5243           unsigned char *bol = begp;
5244           while (begp < endp && *begp < 0x80)
5245             {
5246               begp++;
5247               if (begp[-1] == '\n')
5248                 bol = begp;
5249             }
5250           begp = bol;
5251           goto label_skip_tail;
5252         }
5253       /* fall down ... */
5254
5255     case coding_type_sjis:
5256     case coding_type_big5:
5257       /* We can skip all ASCII characters at the head and tail.  */
5258       if (eol_conversion)
5259         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5260       else
5261         while (begp < endp && *begp < 0x80) begp++;
5262     label_skip_tail:
5263       if (eol_conversion)
5264         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5265       else
5266         while (begp < endp && *(endp - 1) < 0x80) endp--;
5267       break;
5268
5269     default:
5270       abort ();
5271     }
5272
5273   *beg += begp - begp_orig;
5274   *end += endp - endp_orig;
5275   return;
5276 }
5277
/* Shrinking a conversion region has a cost of its own, so regions
   whose byte length does not exceed this value are converted as they
   are, without any attempt to shrink them.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END with shrink_encoding_region (when
   ENCODEP is nonzero) or shrink_decoding_region (when it is zero),
   provided the region is longer than the threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
5291
5292 static Lisp_Object
5293 code_convert_region_unwind (arg)
5294      Lisp_Object arg;
5295 {
5296   inhibit_pre_post_conversion = 0;
5297   Vlast_coding_system_used = arg;
5298   return Qnil;
5299 }
5300
5301 /* Store information about all compositions in the range FROM and TO
5302    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5303    buffer or a string, defaults to the current buffer.  */
5304
5305 void
5306 coding_save_composition (coding, from, to, obj)
5307      struct coding_system *coding;
5308      int from, to;
5309      Lisp_Object obj;
5310 {
5311   Lisp_Object prop;
5312   int start, end;
5313
5314   if (coding->composing == COMPOSITION_DISABLED)
5315     return;
5316   if (!coding->cmp_data)
5317     coding_allocate_composition_data (coding, from);
5318   if (!find_composition (from, to, &start, &end, &prop, obj)
5319       || end > to)
5320     return;
5321   if (start < from
5322       && (!find_composition (end, to, &start, &end, &prop, obj)
5323           || end > to))
5324     return;
5325   coding->composing = COMPOSITION_NO;
5326   do
5327     {
5328       if (COMPOSITION_VALID_P (start, end, prop))
5329         {
5330           enum composition_method method = COMPOSITION_METHOD (prop);
5331           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5332               >= COMPOSITION_DATA_SIZE)
5333             coding_allocate_composition_data (coding, from);
5334           /* For relative composition, we remember start and end
5335              positions, for the other compositions, we also remember
5336              components.  */
5337           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5338           if (method != COMPOSITION_RELATIVE)
5339             {
5340               /* We must store a*/
5341               Lisp_Object val, ch;
5342
5343               val = COMPOSITION_COMPONENTS (prop);
5344               if (CONSP (val))
5345                 while (CONSP (val))
5346                   {
5347                     ch = XCAR (val), val = XCDR (val);
5348                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5349                   }
5350               else if (VECTORP (val) || STRINGP (val))
5351                 {
5352                   int len = (VECTORP (val)
5353                              ? XVECTOR (val)->size : SCHARS (val));
5354                   int i;
5355                   for (i = 0; i < len; i++)
5356                     {
5357                       ch = (STRINGP (val)
5358                             ? Faref (val, make_number (i))
5359                             : XVECTOR (val)->contents[i]);
5360                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5361                     }
5362                 }
5363               else              /* INTEGERP (val) */
5364                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5365             }
5366           CODING_ADD_COMPOSITION_END (coding, end - from);
5367         }
5368       start = end;
5369     }
5370   while (start < to
5371          && find_composition (start, to, &start, &end, &prop, obj)
5372          && end <= to);
5373
5374   /* Make coding->cmp_data point to the first memory block.  */
5375   while (coding->cmp_data->prev)
5376     coding->cmp_data = coding->cmp_data->prev;
5377   coding->cmp_data_start = 0;
5378 }
5379
5380 /* Reflect the saved information about compositions to OBJ.
5381    CODING->cmp_data points to a memory block for the information.  OBJ
5382    is a buffer or a string, defaults to the current buffer.  */
5383
5384 void
5385 coding_restore_composition (coding, obj)
5386      struct coding_system *coding;
5387      Lisp_Object obj;
5388 {
5389   struct composition_data *cmp_data = coding->cmp_data;
5390
5391   if (!cmp_data)
5392     return;
5393
5394   while (cmp_data->prev)
5395     cmp_data = cmp_data->prev;
5396
5397   while (cmp_data)
5398     {
5399       int i;
5400
5401       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5402            i += cmp_data->data[i])
5403         {
5404           int *data = cmp_data->data + i;
5405           enum composition_method method = (enum composition_method) data[3];
5406           Lisp_Object components;
5407
5408           if (method == COMPOSITION_RELATIVE)
5409             components = Qnil;
5410           else
5411             {
5412               int len = data[0] - 4, j;
5413               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5414
5415               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5416                   && len % 2 == 0)
5417                 len --;
5418               for (j = 0; j < len; j++)
5419                 args[j] = make_number (data[4 + j]);
5420               components = (method == COMPOSITION_WITH_ALTCHARS
5421                             ? Fstring (len, args) : Fvector (len, args));
5422             }
5423           compose_text (data[1], data[2], components, Qnil, obj);
5424         }
5425       cmp_data = cmp_data->next;
5426     }
5427 }
5428
5429 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5430    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5431    coding system CODING, and return the status code of code conversion
5432    (currently, this value has no meaning).
5433
5434    How many characters (and bytes) are converted to how many
5435    characters (and bytes) are recorded in members of the structure
5436    CODING.
5437
5438    If REPLACE is nonzero, we do various things as if the original text
5439    is deleted and a new text is inserted.  See the comments in
5440    replace_range (insdel.c) to know what we are doing.
5441
5442    If REPLACE is zero, it is assumed that the source text is unibyte.
5443    Otherwise, it is assumed that the source text is multibyte.  */
5444
5445 int
5446 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5447      int from, from_byte, to, to_byte, encodep, replace;
5448      struct coding_system *coding;
5449 {
5450   int len = to - from, len_byte = to_byte - from_byte;
5451   int nchars_del = 0, nbytes_del = 0;
5452   int require, inserted, inserted_byte;
5453   int head_skip, tail_skip, total_skip = 0;
5454   Lisp_Object saved_coding_symbol;
5455   int first = 1;
5456   unsigned char *src, *dst;
5457   Lisp_Object deletion;
5458   int orig_point = PT, orig_len = len;
5459   int prev_Z;
5460   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5461
5462   deletion = Qnil;
5463   saved_coding_symbol = coding->symbol;
5464
5465   if (from < PT && PT < to)
5466     {
5467       TEMP_SET_PT_BOTH (from, from_byte);
5468       orig_point = from;
5469     }
5470
5471   if (replace)
5472     {
5473       int saved_from = from;
5474       int saved_inhibit_modification_hooks;
5475
5476       prepare_to_modify_buffer (from, to, &from);
5477       if (saved_from != from)
5478         {
5479           to = from + len;
5480           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5481           len_byte = to_byte - from_byte;
5482         }
5483
5484       /* The code conversion routine can not preserve text properties
5485          for now.  So, we must remove all text properties in the
5486          region.  Here, we must suppress all modification hooks.  */
5487       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5488       inhibit_modification_hooks = 1;
5489       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5490       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5491     }
5492
5493   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5494     {
5495       /* We must detect encoding of text and eol format.  */
5496
5497       if (from < GPT && to > GPT)
5498         move_gap_both (from, from_byte);
5499       if (coding->type == coding_type_undecided)
5500         {
5501           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5502           if (coding->type == coding_type_undecided)
5503             {
5504               /* It seems that the text contains only ASCII, but we
5505                  should not leave it undecided because the deeper
5506                  decoding routine (decode_coding) tries to detect the
5507                  encodings again in vain.  */
5508               coding->type = coding_type_emacs_mule;
5509               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5510               /* As emacs-mule decoder will handle composition, we
5511                  need this setting to allocate coding->cmp_data
5512                  later.  */
5513               coding->composing = COMPOSITION_NO;
5514             }
5515         }
5516       if (coding->eol_type == CODING_EOL_UNDECIDED
5517           && coding->type != coding_type_ccl)
5518         {
5519           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5520           if (coding->eol_type == CODING_EOL_UNDECIDED)
5521             coding->eol_type = CODING_EOL_LF;
5522           /* We had better recover the original eol format if we
5523              encounter an inconsistent eol format while decoding.  */
5524           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5525         }
5526     }
5527
5528   /* Now we convert the text.  */
5529
5530   /* For encoding, we must process pre-write-conversion in advance.  */
5531   if (! inhibit_pre_post_conversion
5532       && encodep
5533       && SYMBOLP (coding->pre_write_conversion)
5534       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5535     {
5536       /* The function in pre-write-conversion may put a new text in a
5537          new buffer.  */
5538       struct buffer *prev = current_buffer;
5539       Lisp_Object new;
5540
5541       record_unwind_protect (code_convert_region_unwind,
5542                              Vlast_coding_system_used);
5543       /* We should not call any more pre-write/post-read-conversion
5544          functions while this pre-write-conversion is running.  */
5545       inhibit_pre_post_conversion = 1;
5546       call2 (coding->pre_write_conversion,
5547              make_number (from), make_number (to));
5548       inhibit_pre_post_conversion = 0;
5549       /* Discard the unwind protect.  */
5550       specpdl_ptr--;
5551
5552       if (current_buffer != prev)
5553         {
5554           len = ZV - BEGV;
5555           new = Fcurrent_buffer ();
5556           set_buffer_internal_1 (prev);
5557           del_range_2 (from, from_byte, to, to_byte, 0);
5558           TEMP_SET_PT_BOTH (from, from_byte);
5559           insert_from_buffer (XBUFFER (new), 1, len, 0);
5560           Fkill_buffer (new);
5561           if (orig_point >= to)
5562             orig_point += len - orig_len;
5563           else if (orig_point > from)
5564             orig_point = from;
5565           orig_len = len;
5566           to = from + len;
5567           from_byte = CHAR_TO_BYTE (from);
5568           to_byte = CHAR_TO_BYTE (to);
5569           len_byte = to_byte - from_byte;
5570           TEMP_SET_PT_BOTH (from, from_byte);
5571         }
5572     }
5573
5574   if (replace)
5575     {
5576       if (! EQ (current_buffer->undo_list, Qt))
5577         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5578       else
5579         {
5580           nchars_del = to - from;
5581           nbytes_del = to_byte - from_byte;
5582         }
5583     }
5584
5585   if (coding->composing != COMPOSITION_DISABLED)
5586     {
5587       if (encodep)
5588         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5589       else
5590         coding_allocate_composition_data (coding, from);
5591     }
5592
5593   /* Try to skip the heading and tailing ASCIIs.  */
5594   if (coding->type != coding_type_ccl)
5595     {
5596       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5597
5598       if (from < GPT && GPT < to)
5599         move_gap_both (from, from_byte);
5600       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5601       if (from_byte == to_byte
5602           && (encodep || NILP (coding->post_read_conversion))
5603           && ! CODING_REQUIRE_FLUSHING (coding))
5604         {
5605           coding->produced = len_byte;
5606           coding->produced_char = len;
5607           if (!replace)
5608             /* We must record and adjust for this new text now.  */
5609             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5610           return 0;
5611         }
5612
5613       head_skip = from_byte - from_byte_orig;
5614       tail_skip = to_byte_orig - to_byte;
5615       total_skip = head_skip + tail_skip;
5616       from += head_skip;
5617       to -= tail_skip;
5618       len -= total_skip; len_byte -= total_skip;
5619     }
5620
5621   /* For conversion, we must put the gap before the text in addition to
5622      making the gap larger for efficient decoding.  The required gap
5623      size starts from 2000 which is the magic number used in make_gap.
5624      But, after one batch of conversion, it will be incremented if we
5625      find that it is not enough .  */
5626   require = 2000;
5627
5628   if (GAP_SIZE  < require)
5629     make_gap (require - GAP_SIZE);
5630   move_gap_both (from, from_byte);
5631
5632   inserted = inserted_byte = 0;
5633
5634   GAP_SIZE += len_byte;
5635   ZV -= len;
5636   Z -= len;
5637   ZV_BYTE -= len_byte;
5638   Z_BYTE -= len_byte;
5639
5640   if (GPT - BEG < BEG_UNCHANGED)
5641     BEG_UNCHANGED = GPT - BEG;
5642   if (Z - GPT < END_UNCHANGED)
5643     END_UNCHANGED = Z - GPT;
5644
5645   if (!encodep && coding->src_multibyte)
5646     {
5647       /* Decoding routines expects that the source text is unibyte.
5648          We must convert 8-bit characters of multibyte form to
5649          unibyte.  */
5650       int len_byte_orig = len_byte;
5651       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5652       if (len_byte < len_byte_orig)
5653         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5654                     len_byte);
5655       coding->src_multibyte = 0;
5656     }
5657
5658   for (;;)
5659     {
5660       int result;
5661
5662       /* The buffer memory is now:
5663          +--------+converted-text+---------+-------original-text-------+---+
5664          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5665                   |<---------------------- GAP ----------------------->|  */
5666       src = GAP_END_ADDR - len_byte;
5667       dst = GPT_ADDR + inserted_byte;
5668
5669       if (encodep)
5670         result = encode_coding (coding, src, dst, len_byte, 0);
5671       else
5672         {
5673           if (coding->composing != COMPOSITION_DISABLED)
5674             coding->cmp_data->char_offset = from + inserted;
5675           result = decode_coding (coding, src, dst, len_byte, 0);
5676         }
5677
5678       /* The buffer memory is now:
5679          +--------+-------converted-text----+--+------original-text----+---+
5680          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5681                   |<---------------------- GAP ----------------------->|  */
5682
5683       inserted += coding->produced_char;
5684       inserted_byte += coding->produced;
5685       len_byte -= coding->consumed;
5686
5687       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5688         {
5689           coding_allocate_composition_data (coding, from + inserted);
5690           continue;
5691         }
5692
5693       src += coding->consumed;
5694       dst += coding->produced;
5695
5696       if (result == CODING_FINISH_NORMAL)
5697         {
5698           src += len_byte;
5699           break;
5700         }
5701       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5702         {
5703           unsigned char *pend = dst, *p = pend - inserted_byte;
5704           Lisp_Object eol_type;
5705
5706           /* Encode LFs back to the original eol format (CR or CRLF).  */
5707           if (coding->eol_type == CODING_EOL_CR)
5708             {
5709               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5710             }
5711           else
5712             {
5713               int count = 0;
5714
5715               while (p < pend) if (*p++ == '\n') count++;
5716               if (src - dst < count)
5717                 {
5718                   /* We don't have sufficient room for encoding LFs
5719                      back to CRLF.  We must record converted and
5720                      not-yet-converted text back to the buffer
5721                      content, enlarge the gap, then record them out of
5722                      the buffer contents again.  */
5723                   int add = len_byte + inserted_byte;
5724
5725                   GAP_SIZE -= add;
5726                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5727                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5728                   make_gap (count - GAP_SIZE);
5729                   GAP_SIZE += add;
5730                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5731                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5732                   /* Don't forget to update SRC, DST, and PEND.  */
5733                   src = GAP_END_ADDR - len_byte;
5734                   dst = GPT_ADDR + inserted_byte;
5735                   pend = dst;
5736                 }
5737               inserted += count;
5738               inserted_byte += count;
5739               coding->produced += count;
5740               p = dst = pend + count;
5741               while (count)
5742                 {
5743                   *--p = *--pend;
5744                   if (*p == '\n') count--, *--p = '\r';
5745                 }
5746             }
5747
5748           /* Suppress eol-format conversion in the further conversion.  */
5749           coding->eol_type = CODING_EOL_LF;
5750
5751           /* Set the coding system symbol to that for Unix-like EOL.  */
5752           eol_type = Fget (saved_coding_symbol, Qeol_type);
5753           if (VECTORP (eol_type)
5754               && XVECTOR (eol_type)->size == 3
5755               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5756             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5757           else
5758             coding->symbol = saved_coding_symbol;
5759
5760           continue;
5761         }
5762       if (len_byte <= 0)
5763         {
5764           if (coding->type != coding_type_ccl
5765               || coding->mode & CODING_MODE_LAST_BLOCK)
5766             break;
5767           coding->mode |= CODING_MODE_LAST_BLOCK;
5768           continue;
5769         }
5770       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5771         {
5772           /* The source text ends in invalid codes.  Let's just
5773              make them valid buffer contents, and finish conversion.  */
5774           if (multibyte_p)
5775             {
5776               unsigned char *start = dst;
5777
5778               inserted += len_byte;
5779               while (len_byte--)
5780                 {
5781                   int c = *src++;
5782                   dst += CHAR_STRING (c, dst);
5783                 }
5784
5785               inserted_byte += dst - start;
5786             }
5787           else
5788             {
5789               inserted += len_byte;
5790               inserted_byte += len_byte;
5791               while (len_byte--)
5792                 *dst++ = *src++;
5793             }
5794           break;
5795         }
5796       if (result == CODING_FINISH_INTERRUPT)
5797         {
5798           /* The conversion procedure was interrupted by a user.  */
5799           break;
5800         }
5801       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5802       if (coding->consumed < 1)
5803         {
5804           /* It's quite strange to require more memory without
5805              consuming any bytes.  Perhaps CCL program bug.  */
5806           break;
5807         }
5808       if (first)
5809         {
5810           /* We have just done the first batch of conversion which was
5811              stopped because of insufficient gap.  Let's reconsider the
5812              required gap size (i.e. SRT - DST) now.
5813
5814              We have converted ORIG bytes (== coding->consumed) into
5815              NEW bytes (coding->produced).  To convert the remaining
5816              LEN bytes, we may need REQUIRE bytes of gap, where:
5817                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5818                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5819              Here, we are sure that NEW >= ORIG.  */
5820           float ratio;
5821
5822           if (coding->produced <= coding->consumed)
5823             {
5824               /* This happens because of CCL-based coding system with
5825                  eol-type CRLF.  */
5826               require = 0;
5827             }
5828           else
5829             {
5830               ratio = (coding->produced - coding->consumed) / coding->consumed;
5831               require = len_byte * ratio;
5832             }
5833           first = 0;
5834         }
5835       if ((src - dst) < (require + 2000))
5836         {
5837           /* See the comment above the previous call of make_gap.  */
5838           int add = len_byte + inserted_byte;
5839
5840           GAP_SIZE -= add;
5841           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5842           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5843           make_gap (require + 2000);
5844           GAP_SIZE += add;
5845           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5846           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5847         }
5848     }
5849   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5850
5851   if (encodep && coding->dst_multibyte)
5852     {
5853       /* The output is unibyte.  We must convert 8-bit characters to
5854          multibyte form.  */
5855       if (inserted_byte * 2 > GAP_SIZE)
5856         {
5857           GAP_SIZE -= inserted_byte;
5858           ZV += inserted_byte; Z += inserted_byte;
5859           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5860           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5861           make_gap (inserted_byte - GAP_SIZE);
5862           GAP_SIZE += inserted_byte;
5863           ZV -= inserted_byte; Z -= inserted_byte;
5864           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5865           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5866         }
5867       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5868     }
5869
5870   /* If we shrank the conversion area, adjust it now.  */
5871   if (total_skip > 0)
5872     {
5873       if (tail_skip > 0)
5874         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5875       inserted += total_skip; inserted_byte += total_skip;
5876       GAP_SIZE += total_skip;
5877       GPT -= head_skip; GPT_BYTE -= head_skip;
5878       ZV -= total_skip; ZV_BYTE -= total_skip;
5879       Z -= total_skip; Z_BYTE -= total_skip;
5880       from -= head_skip; from_byte -= head_skip;
5881       to += tail_skip; to_byte += tail_skip;
5882     }
5883
5884   prev_Z = Z;
5885   if (! EQ (current_buffer->undo_list, Qt))
5886     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5887   else
5888     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5889                                  inserted, inserted_byte);
5890   inserted = Z - prev_Z;
5891
5892   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5893     coding_restore_composition (coding, Fcurrent_buffer ());
5894   coding_free_composition_data (coding);
5895
5896   if (! inhibit_pre_post_conversion
5897       && ! encodep && ! NILP (coding->post_read_conversion))
5898     {
5899       Lisp_Object val;
5900       Lisp_Object saved_coding_system;
5901
5902       if (from != PT)
5903         TEMP_SET_PT_BOTH (from, from_byte);
5904       prev_Z = Z;
5905       record_unwind_protect (code_convert_region_unwind,
5906                              Vlast_coding_system_used);
5907       saved_coding_system = Vlast_coding_system_used;
5908       Vlast_coding_system_used = coding->symbol;
5909       /* We should not call any more pre-write/post-read-conversion
5910          functions while this post-read-conversion is running.  */
5911       inhibit_pre_post_conversion = 1;
5912       val = call1 (coding->post_read_conversion, make_number (inserted));
5913       inhibit_pre_post_conversion = 0;
5914       coding->symbol = Vlast_coding_system_used;
5915       Vlast_coding_system_used = saved_coding_system;
5916       /* Discard the unwind protect.  */
5917       specpdl_ptr--;
5918       CHECK_NUMBER (val);
5919       inserted += Z - prev_Z;
5920     }
5921
5922   if (orig_point >= from)
5923     {
5924       if (orig_point >= from + orig_len)
5925         orig_point += inserted - orig_len;
5926       else
5927         orig_point = from;
5928       TEMP_SET_PT (orig_point);
5929     }
5930
5931   if (replace)
5932     {
5933       signal_after_change (from, to - from, inserted);
5934       update_compositions (from, from + inserted, CHECK_BORDER);
5935     }
5936
5937   {
5938     coding->consumed = to_byte - from_byte;
5939     coding->consumed_char = to - from;
5940     coding->produced = inserted_byte;
5941     coding->produced_char = inserted;
5942   }
5943
5944   return 0;
5945 }
5946
5947 Lisp_Object
5948 run_pre_post_conversion_on_str (str, coding, encodep)
5949      Lisp_Object str;
5950      struct coding_system *coding;
5951      int encodep;
5952 {
5953   int count = SPECPDL_INDEX ();
5954   struct gcpro gcpro1, gcpro2;
5955   int multibyte = STRING_MULTIBYTE (str);
5956   Lisp_Object buffer;
5957   struct buffer *buf;
5958   Lisp_Object old_deactivate_mark;
5959
5960   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5961   record_unwind_protect (code_convert_region_unwind,
5962                          Vlast_coding_system_used);
5963   /* It is not crucial to specbind this.  */
5964   old_deactivate_mark = Vdeactivate_mark;
5965   GCPRO2 (str, old_deactivate_mark);
5966
5967   buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5968   buf = XBUFFER (buffer);
5969
5970   buf->directory = current_buffer->directory;
5971   buf->read_only = Qnil;
5972   buf->filename = Qnil;
5973   buf->undo_list = Qt;
5974   buf->overlays_before = Qnil;
5975   buf->overlays_after = Qnil;
5976
5977   set_buffer_internal (buf);
5978   /* We must insert the contents of STR as is without
5979      unibyte<->multibyte conversion.  For that, we adjust the
5980      multibyteness of the working buffer to that of STR.  */
5981   Ferase_buffer ();
5982   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
5983
5984   insert_from_string (str, 0, 0,
5985                       SCHARS (str), SBYTES (str), 0);
5986   UNGCPRO;
5987   inhibit_pre_post_conversion = 1;
5988   if (encodep)
5989     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5990   else
5991     {
5992       Vlast_coding_system_used = coding->symbol;
5993       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5994       call1 (coding->post_read_conversion, make_number (Z - BEG));
5995       coding->symbol = Vlast_coding_system_used;
5996     }
5997   inhibit_pre_post_conversion = 0;
5998   Vdeactivate_mark = old_deactivate_mark;
5999   str = make_buffer_string (BEG, Z, 1);
6000   return unbind_to (count, str);
6001 }
6002
/* Decode the bytes of the Lisp string STR according to CODING and
   return the decoded text as a Lisp string.

   If CODING still requires detection, the text type and/or the eol
   format are detected from the bytes of STR first.  When no decoding
   turns out to be necessary and CODING has no post-read-conversion
   function that is a bound symbol, STR itself is returned if NOCOPY
   is nonzero and a copy of it otherwise.

   CODING->consumed, consumed_char, produced and produced_char are
   set both on the no-decoding path and after the decoding loop.  */
6003 Lisp_Object
6004 decode_coding_string (str, coding, nocopy)
6005      Lisp_Object str;
6006      struct coding_system *coding;
6007      int nocopy;
6008 {
6009   int len;
6010   struct conversion_buffer buf;
6011   int from, to_byte;
6012   Lisp_Object saved_coding_symbol;
6013   int result;
6014   int require_decoding;
6015   int shrinked_bytes = 0;
6016   Lisp_Object newstr;
6017   int consumed, consumed_char, produced, produced_char;
6018
6019   from = 0;
6020   to_byte = SBYTES (str);
6021
6022   saved_coding_symbol = coding->symbol;
6023   coding->src_multibyte = STRING_MULTIBYTE (str);
6024   coding->dst_multibyte = 1;
6025   if (CODING_REQUIRE_DETECTION (coding))
6026     {
6027       /* See the comments in code_convert_region.  */
6028       if (coding->type == coding_type_undecided)
6029         {
6030           detect_coding (coding, SDATA (str), to_byte);
6031           if (coding->type == coding_type_undecided)
6032             {
6033               coding->type = coding_type_emacs_mule;
6034               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6035               /* As emacs-mule decoder will handle composition, we
6036                  need this setting to allocate coding->cmp_data
6037                  later.  */
6038               coding->composing = COMPOSITION_NO;
6039             }
6040         }
6041       if (coding->eol_type == CODING_EOL_UNDECIDED
6042           && coding->type != coding_type_ccl)
6043         {
               /* Remember the symbol as it is before eol detection; it
                  is used below to find the LF variant if the eol
                  format turns out to be inconsistent.  */
6044           saved_coding_symbol = coding->symbol;
6045           detect_eol (coding, SDATA (str), to_byte);
6046           if (coding->eol_type == CODING_EOL_UNDECIDED)
6047             coding->eol_type = CODING_EOL_LF;
6048           /* We had better recover the original eol format if we
6049              encounter an inconsistent eol format while decoding.  */
6050           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6051         }
6052     }
6053
6054   if (coding->type == coding_type_no_conversion
6055       || coding->type == coding_type_raw_text)
6056     coding->dst_multibyte = 0;
6057
6058   require_decoding = CODING_REQUIRE_DECODING (coding);
6059
6060   if (STRING_MULTIBYTE (str))
6061     {
6062       /* Decoding routines expect the source text to be unibyte.  */
6063       str = Fstring_as_unibyte (str);
6064       to_byte = SBYTES (str);
           /* NOTE(review): presumably STR is now a fresh string that
              need not be copied again -- confirm against
              Fstring_as_unibyte.  */
6065       nocopy = 1;
6066       coding->src_multibyte = 0;
6067     }
6068
6069   /* Try to skip the heading and tailing ASCIIs.  */
6070   if (require_decoding && coding->type != coding_type_ccl)
6071     {
6072       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6073                                 0);
6074       if (from == to_byte)
6075         require_decoding = 0;
           /* Bytes excluded from decoding: FROM bytes at the head plus
              the bytes after TO_BYTE at the tail.  */
6076       shrinked_bytes = from + (SBYTES (str) - to_byte);
6077     }
6078
       /* Nothing to decode and no usable post-read-conversion function:
          hand back STR (passed through Fstring_as_multibyte when the
          destination is multibyte) or a copy of it.  */
6079   if (!require_decoding
6080       && !(SYMBOLP (coding->post_read_conversion)
6081            && !NILP (Ffboundp (coding->post_read_conversion))))
6082     {
6083       coding->consumed = SBYTES (str);
6084       coding->consumed_char = SCHARS (str);
6085       if (coding->dst_multibyte)
6086         {
6087           str = Fstring_as_multibyte (str);
6088           nocopy = 1;
6089         }
6090       coding->produced = SBYTES (str);
6091       coding->produced_char = SCHARS (str);
6092       return (nocopy ? str : Fcopy_sequence (str));
6093     }
6094
6095   if (coding->composing != COMPOSITION_DISABLED)
6096     coding_allocate_composition_data (coding, from);
6097   len = decoding_buffer_size (coding, to_byte - from);
6098   allocate_conversion_buffer (buf, len);
6099
6100   consumed = consumed_char = produced = produced_char = 0;
6101   while (1)
6102     {
           /* Each call resumes right after the source bytes consumed so
              far and appends after the bytes produced so far.  */
6103       result = decode_coding (coding, SDATA (str) + from + consumed,
6104                               buf.data + produced, to_byte - from - consumed,
6105                               buf.size - produced);
6106       consumed += coding->consumed;
6107       consumed_char += coding->consumed_char;
6108       produced += coding->produced;
6109       produced_char += coding->produced_char;
6110       if (result == CODING_FINISH_NORMAL
6111           || (result == CODING_FINISH_INSUFFICIENT_SRC
6112               && coding->consumed == 0))
6113         break;
6114       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6115         coding_allocate_composition_data (coding, from + produced_char);
6116       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6117         extend_conversion_buffer (&buf);
6118       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6119         {
6120           Lisp_Object eol_type;
6121
6122           /* Recover the original EOL format.  */
6123           if (coding->eol_type == CODING_EOL_CR)
6124             {
6125               unsigned char *p;
6126               for (p = buf.data; p < buf.data + produced; p++)
6127                 if (*p == '\n') *p = '\r';
6128             }
6129           else if (coding->eol_type == CODING_EOL_CRLF)
6130             {
6131               int num_eol = 0;
6132               unsigned char *p0, *p1;
6133               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6134                 if (*p0 == '\n') num_eol++;
                   /* NOTE(review): the buffer is extended only once
                      here; confirm that a single extension always
                      leaves room for NUM_EOL additional bytes.  */
6135               if (produced + num_eol >= buf.size)
6136                 extend_conversion_buffer (&buf);
                   /* Re-insert a CR before every LF, working backward
                      from the end so that no unread byte is
                      overwritten.  */
6137               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6138                 {
6139                   *--p1 = *--p0;
6140                   if (*p0 == '\n') *--p1 = '\r';
6141                 }
6142               produced += num_eol;
6143               produced_char += num_eol;
6144             }
6145           /* Suppress eol-format conversion in the further conversion.  */
6146           coding->eol_type = CODING_EOL_LF;
6147
6148           /* Set the coding system symbol to that for Unix-like EOL.  */
6149           eol_type = Fget (saved_coding_symbol, Qeol_type);
6150           if (VECTORP (eol_type)
6151               && XVECTOR (eol_type)->size == 3
6152               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6153             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6154           else
6155             coding->symbol = saved_coding_symbol;
6156
6157
6158         }
6159     }
6160
       /* Report the totals accumulated over all decode_coding calls.  */
6161   coding->consumed = consumed;
6162   coding->consumed_char = consumed_char;
6163   coding->produced = produced;
6164   coding->produced_char = produced_char;
6165
       /* Assemble the result: the skipped head of STR, the decoded
          text, then the skipped tail of STR.  */
6166   if (coding->dst_multibyte)
6167     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6168                                            produced + shrinked_bytes);
6169   else
6170     newstr = make_uninit_string (produced + shrinked_bytes);
6171   if (from > 0)
6172     STRING_COPYIN (newstr, 0, SDATA (str), from);
6173   STRING_COPYIN (newstr, from, buf.data, produced);
6174   if (shrinked_bytes > from)
6175     STRING_COPYIN (newstr, from + produced,
6176                    SDATA (str) + to_byte,
6177                    shrinked_bytes - from);
6178   free_conversion_buffer (&buf);
6179
6180   if (coding->cmp_data && coding->cmp_data->used)
6181     coding_restore_composition (coding, newstr);
6182   coding_free_composition_data (coding);
6183
       /* Finally let the post-read-conversion function, if it is a
          bound symbol, process the decoded string.  */
6184   if (SYMBOLP (coding->post_read_conversion)
6185       && !NILP (Ffboundp (coding->post_read_conversion)))
6186     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6187
6188   return newstr;
6189 }
6190
6191 Lisp_Object
6192 encode_coding_string (str, coding, nocopy)
6193      Lisp_Object str;
6194      struct coding_system *coding;
6195      int nocopy;
6196 {
6197   int len;
6198   struct conversion_buffer buf;
6199   int from, to, to_byte;
6200   int result;
6201   int shrinked_bytes = 0;
6202   Lisp_Object newstr;
6203   int consumed, consumed_char, produced, produced_char;
6204
6205   if (SYMBOLP (coding->pre_write_conversion)
6206       && !NILP (Ffboundp (coding->pre_write_conversion)))
6207     str = run_pre_post_conversion_on_str (str, coding, 1);
6208
6209   from = 0;
6210   to = SCHARS (str);
6211   to_byte = SBYTES (str);
6212
6213   /* Encoding routines determine the multibyteness of the source text
6214      by coding->src_multibyte.  */
6215   coding->src_multibyte = STRING_MULTIBYTE (str);
6216   coding->dst_multibyte = 0;
6217   if (! CODING_REQUIRE_ENCODING (coding))
6218     {
6219       coding->consumed = SBYTES (str);
6220       coding->consumed_char = SCHARS (str);
6221       if (STRING_MULTIBYTE (str))
6222         {
6223           str = Fstring_as_unibyte (str);
6224           nocopy = 1;
6225         }
6226       coding->produced = SBYTES (str);
6227       coding->produced_char = SCHARS (str);
6228       return (nocopy ? str : Fcopy_sequence (str));
6229     }
6230
6231   if (coding->composing != COMPOSITION_DISABLED)
6232     coding_save_composition (coding, from, to, str);
6233
6234   /* Try to skip the heading and tailing ASCIIs.  */
6235   if (coding->type != coding_type_ccl)
6236     {
6237       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6238                                 1);
6239       if (from == to_byte)
6240         return (nocopy ? str : Fcopy_sequence (str));
6241       shrinked_bytes = from + (SBYTES (str) - to_byte);
6242     }
6243
6244   len = encoding_buffer_size (coding, to_byte - from);
6245   allocate_conversion_buffer (buf, len);
6246
6247   consumed = consumed_char = produced = produced_char = 0;
6248   while (1)
6249     {
6250       result = encode_coding (coding, SDATA (str) + from + consumed,
6251                               buf.data + produced, to_byte - from - consumed,
6252                               buf.size - produced);
6253       consumed += coding->consumed;
6254       consumed_char += coding->consumed_char;
6255       produced += coding->produced;
6256       produced_char += coding->produced_char;
6257       if (result == CODING_FINISH_NORMAL
6258           || (result == CODING_FINISH_INSUFFICIENT_SRC
6259               && coding->consumed == 0))
6260         break;
6261       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6262       extend_conversion_buffer (&buf);
6263     }
6264
6265   coding->consumed = consumed;
6266   coding->consumed_char = consumed_char;
6267   coding->produced = produced;
6268   coding->produced_char = produced_char;
6269
6270   newstr = make_uninit_string (produced + shrinked_bytes);
6271   if (from > 0)
6272     STRING_COPYIN (newstr, 0, SDATA (str), from);
6273   STRING_COPYIN (newstr, from, buf.data, produced);
6274   if (shrinked_bytes > from)
6275     STRING_COPYIN (newstr, from + produced,
6276                    SDATA (str) + to_byte,
6277                    shrinked_bytes - from);
6278
6279   free_conversion_buffer (&buf);
6280   coding_free_composition_data (coding);
6281
6282   return newstr;
6283 }
6284
6285 \f
6286 #ifdef emacs
6287 /*** 8. Emacs Lisp library functions ***/
6288
6289 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6290        doc: /* Return t if OBJECT is nil or a coding-system.
6291 See the documentation of `make-coding-system' for information
6292 about coding-system objects.  */)
6293      (obj)
6294      Lisp_Object obj;
6295 {
6296   if (NILP (obj))
6297     return Qt;
6298   if (!SYMBOLP (obj))
6299     return Qnil;
6300   /* Get coding-spec vector for OBJ.  */
6301   obj = Fget (obj, Qcoding_system);
6302   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6303           ? Qt : Qnil);
6304 }
6305
6306 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6307        Sread_non_nil_coding_system, 1, 1, 0,
6308        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6309      (prompt)
6310      Lisp_Object prompt;
6311 {
6312   Lisp_Object val;
6313   do
6314     {
6315       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6316                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6317     }
6318   while (SCHARS (val) == 0);
6319   return (Fintern (val, Qnil));
6320 }
6321
6322 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6323        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6324 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6325      (prompt, default_coding_system)
6326      Lisp_Object prompt, default_coding_system;
6327 {
6328   Lisp_Object val;
6329   if (SYMBOLP (default_coding_system))
6330     default_coding_system = SYMBOL_NAME (default_coding_system);
6331   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6332                           Qt, Qnil, Qcoding_system_history,
6333                           default_coding_system, Qnil);
6334   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6335 }
6336
6337 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6338        1, 1, 0,
6339        doc: /* Check validity of CODING-SYSTEM.
6340 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6341 It is valid if it is a symbol with a non-nil `coding-system' property.
6342 The value of property should be a vector of length 5.  */)
6343      (coding_system)
6344      Lisp_Object coding_system;
6345 {
6346   CHECK_SYMBOL (coding_system);
6347   if (!NILP (Fcoding_system_p (coding_system)))
6348     return coding_system;
6349   while (1)
6350     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6351 }
6352 \f
6353 Lisp_Object
6354 detect_coding_system (src, src_bytes, highest, multibytep)
6355      const unsigned char *src;
6356      int src_bytes, highest;
6357      int multibytep;
6358 {
6359   int coding_mask, eol_type;
6360   Lisp_Object val, tmp;
6361   int dummy;
6362
6363   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6364   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6365   if (eol_type == CODING_EOL_INCONSISTENT)
6366     eol_type = CODING_EOL_UNDECIDED;
6367
6368   if (!coding_mask)
6369     {
6370       val = Qundecided;
6371       if (eol_type != CODING_EOL_UNDECIDED)
6372         {
6373           Lisp_Object val2;
6374           val2 = Fget (Qundecided, Qeol_type);
6375           if (VECTORP (val2))
6376             val = XVECTOR (val2)->contents[eol_type];
6377         }
6378       return (highest ? val : Fcons (val, Qnil));
6379     }
6380
6381   /* At first, gather possible coding systems in VAL.  */
6382   val = Qnil;
6383   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6384     {
6385       Lisp_Object category_val, category_index;
6386
6387       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6388       category_val = Fsymbol_value (XCAR (tmp));
6389       if (!NILP (category_val)
6390           && NATNUMP (category_index)
6391           && (coding_mask & (1 << XFASTINT (category_index))))
6392         {
6393           val = Fcons (category_val, val);
6394           if (highest)
6395             break;
6396         }
6397     }
6398   if (!highest)
6399     val = Fnreverse (val);
6400
6401   /* Then, replace the elements with subsidiary coding systems.  */
6402   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6403     {
6404       if (eol_type != CODING_EOL_UNDECIDED
6405           && eol_type != CODING_EOL_INCONSISTENT)
6406         {
6407           Lisp_Object eol;
6408           eol = Fget (XCAR (tmp), Qeol_type);
6409           if (VECTORP (eol))
6410             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6411         }
6412     }
6413   return (highest ? XCAR (val) : val);
6414 }
6415
6416 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6417        2, 3, 0,
6418        doc: /* Detect how the byte sequence in the region is encoded.
6419 Return a list of possible coding systems used on decoding a byte
6420 sequence containing the bytes in the region between START and END when
6421 the coding system `undecided' is specified.  The list is ordered by
6422 priority decided in the current language environment.
6423
6424 If only ASCII characters are found, it returns a list of single element
6425 `undecided' or its subsidiary coding system according to a detected
6426 end-of-line format.
6427
6428 If optional argument HIGHEST is non-nil, return the coding system of
6429 highest priority.  */)
6430      (start, end, highest)
6431      Lisp_Object start, end, highest;
6432 {
6433   int from, to;
6434   int from_byte, to_byte;
6435   int include_anchor_byte = 0;
6436
6437   CHECK_NUMBER_COERCE_MARKER (start);
6438   CHECK_NUMBER_COERCE_MARKER (end);
6439
6440   validate_region (&start, &end);
6441   from = XINT (start), to = XINT (end);
6442   from_byte = CHAR_TO_BYTE (from);
6443   to_byte = CHAR_TO_BYTE (to);
6444
6445   if (from < GPT && to >= GPT)
6446     move_gap_both (to, to_byte);
6447   /* If we an anchor byte `\0' follows the region, we include it in
6448      the detecting source.  Then code detectors can handle the tailing
6449      byte sequence more accurately.
6450
6451      Fix me: This is not a perfect solution.  It is better that we
6452      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6453   */
6454   if (to == Z || (to == GPT && GAP_SIZE > 0))
6455     include_anchor_byte = 1;
6456   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6457                                to_byte - from_byte + include_anchor_byte,
6458                                !NILP (highest),
6459                                !NILP (current_buffer
6460                                       ->enable_multibyte_characters));
6461 }
6462
6463 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6464        1, 2, 0,
6465        doc: /* Detect how the byte sequence in STRING is encoded.
6466 Return a list of possible coding systems used on decoding a byte
6467 sequence containing the bytes in STRING when the coding system
6468 `undecided' is specified.  The list is ordered by priority decided in
6469 the current language environment.
6470
6471 If only ASCII characters are found, it returns a list of single element
6472 `undecided' or its subsidiary coding system according to a detected
6473 end-of-line format.
6474
6475 If optional argument HIGHEST is non-nil, return the coding system of
6476 highest priority.  */)
6477      (string, highest)
6478      Lisp_Object string, highest;
6479 {
6480   CHECK_STRING (string);
6481
6482   return detect_coding_system (SDATA (string),
6483                                /* "+ 1" is to include the anchor byte
6484                                   `\0'.  With this, code detectors can
6485                                   handle the tailing bytes more
6486                                   accurately.  */
6487                                SBYTES (string) + 1,
6488                                !NILP (highest),
6489                                STRING_MULTIBYTE (string));
6490 }
6491
6492 /*  Subroutine for Ffind_coding_systems_region_internal.
6493
6494     Return a list of coding systems that safely encode the multibyte
6495     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6496     possible coding systems.  If it is nil, it means that we have not
6497     yet found any coding systems.
6498
6499     WORK_TABLE is a char-table that records which characters were
6500     already examined: an element is set to t once it is looked up.
6501
6502     If a non-ASCII single byte char is found, set
6503     *single_byte_char_found to 1.
         Elements of SAFE_CODINGS are rewritten in place from the form
         (CODING . SAFE-CHARS) to a five-element list the first time a
         character is missing from SAFE-CHARS, and elements that cannot
         encode a character are unlinked from the list in place.  */
6504
6505 static Lisp_Object
6506 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6507      unsigned char *p, *pend;
6508      Lisp_Object safe_codings, work_table;
6509      int *single_byte_char_found;
6510 {
6511   int c, len;
6512   Lisp_Object val, ch;
6513   Lisp_Object prev, tail;
6514
6515   while (p < pend)
6516     {
6517       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6518       p += len;
6519       if (ASCII_BYTE_P (c))
6520         /* We can ignore ASCII characters here.  */
6521         continue;
6522       if (SINGLE_BYTE_CHAR_P (c))
6523         *single_byte_char_found = 1;
6524       if (NILP (safe_codings))
6525         /* Already all coding systems are excluded.  But, we can't
6526            terminate the loop here because non-ASCII single-byte char
6527            must be found.  */
6528         continue;
6529       /* Check the safe coding systems for C.  */
6530       ch = make_number (c);
6531       val = Faref (work_table, ch);
6532       if (EQ (val, Qt))
6533         /* This element was already checked.  Ignore it.  */
6534         continue;
6535       /* Remember that we checked this element.  */
6536       Faset (work_table, ch, Qt);
6537
           /* PREV is the most recent element found able to encode C
              (initially the head of the list); it is used to unlink a
              rejected TAIL that is not at the head.  */
6538       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6539         {
6540           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6541           int encodable;
6542
               /* TRANSLATION_TABLE, HASH_TABLE and ACCEPT_LATIN_EXTRA
                  are assigned below only when ENCODABLE is zero, and
                  are read only in that case.  */
6543           elt = XCAR (tail);
6544           if (CONSP (XCDR (elt)))
6545             {
6546               /* This entry has this format now:
6547                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6548                           ACCEPT-LATIN-EXTRA ) */
6549               val = XCDR (elt);
6550               encodable = ! NILP (Faref (XCAR (val), ch));
6551               if (! encodable)
6552                 {
6553                   val = XCDR (val);
6554                   translation_table = XCAR (val);
6555                   hash_table = XCAR (XCDR (val));
6556                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6557                 }
6558             }
6559           else
6560             {
6561               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6562               encodable = ! NILP (Faref (XCDR (elt), ch));
6563               if (! encodable)
6564                 {
6565                   /* Transform the format to:
6566                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6567                        ACCEPT-LATIN-EXTRA )  */
6568                   val = Fget (XCAR (elt), Qcoding_system);
6569                   translation_table
6570                     = Fplist_get (AREF (val, 3),
6571                                   Qtranslation_table_for_encode);
6572                   if (SYMBOLP (translation_table))
6573                     translation_table = Fget (translation_table,
6574                                               Qtranslation_table);
6575                   hash_table
6576                     = (CHAR_TABLE_P (translation_table)
6577                        ? XCHAR_TABLE (translation_table)->extras[1]
6578                        : Qnil);
6579                   accept_latin_extra
6580                     = ((EQ (AREF (val, 0), make_number (2))
6581                         && VECTORP (AREF (val, 4)))
6582                        ? AREF (AREF (val, 4), 16)
6583                        : Qnil);
6584                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6585                                         translation_table, hash_table,
6586                                         accept_latin_extra));
6587                 }
6588             }
6589
               /* SAFE-CHARS rejected C.  It still counts as encodable
                  if the translation table or its hash table has an
                  entry for it, or if it is a single-byte char that
                  Vlatin_extra_code_table accepts and this coding
                  system has ACCEPT-LATIN-EXTRA set.  */
6590           if (! encodable
6591               && ((CHAR_TABLE_P (translation_table)
6592                    && ! NILP (Faref (translation_table, ch)))
6593                   || (HASH_TABLE_P (hash_table)
6594                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6595                   || (SINGLE_BYTE_CHAR_P (c)
6596                       && ! NILP (accept_latin_extra)
6597                       && VECTORP (Vlatin_extra_code_table)
6598                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6599             encodable = 1;
6600           if (encodable)
6601             prev = tail;
6602           else
6603             {
6604               /* Exclude this coding system from SAFE_CODINGS.  */
6605               if (EQ (tail, safe_codings))
6606                 safe_codings = XCDR (safe_codings);
6607               else
6608                 XSETCDR (prev, XCDR (tail));
6609             }
6610         }
6611     }
6612   return safe_codings;
6613 }
6614
6615 DEFUN ("find-coding-systems-region-internal",
6616        Ffind_coding_systems_region_internal,
6617        Sfind_coding_systems_region_internal, 2, 2, 0,
6618        doc: /* Internal use only.  */)
6619      (start, end)
6620      Lisp_Object start, end;
6621 {
6622   Lisp_Object work_table, safe_codings;
6623   int non_ascii_p = 0;
6624   int single_byte_char_found = 0;
6625   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6626
6627   if (STRINGP (start))
6628     {
6629       if (!STRING_MULTIBYTE (start))
6630         return Qt;
6631       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6632       p2 = p2end = p1end;
6633       if (SCHARS (start) != SBYTES (start))
6634         non_ascii_p = 1;
6635     }
6636   else
6637     {
6638       int from, to, stop;
6639
6640       CHECK_NUMBER_COERCE_MARKER (start);
6641       CHECK_NUMBER_COERCE_MARKER (end);
6642       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6643         args_out_of_range (start, end);
6644       if (NILP (current_buffer->enable_multibyte_characters))
6645         return Qt;
6646       from = CHAR_TO_BYTE (XINT (start));
6647       to = CHAR_TO_BYTE (XINT (end));
6648       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6649       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6650       if (stop == to)
6651         p2 = p2end = p1end;
6652       else
6653         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6654       if (XINT (end) - XINT (start) != to - from)
6655         non_ascii_p = 1;
6656     }
6657
6658   if (!non_ascii_p)
6659     {
6660       /* We are sure that the text contains no multibyte character.
6661          Check if it contains eight-bit-graphic.  */
6662       p = p1;
6663       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6664       if (p == p1end)
6665         {
6666           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6667           if (p == p2end)
6668             return Qt;
6669         }
6670     }
6671
6672   /* The text contains non-ASCII characters.  */
6673
6674   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6675   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6676
6677   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6678                                     &single_byte_char_found);
6679   if (p2 < p2end)
6680     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6681                                       &single_byte_char_found);
6682   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6683     safe_codings = Qt;
6684   else
6685     {
6686       /* Turn safe_codings to a list of coding systems... */
6687       Lisp_Object val;
6688
6689       if (single_byte_char_found)
6690         /* ... and append these for eight-bit chars.  */
6691         val = Fcons (Qraw_text,
6692                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6693       else
6694         /* ... and append generic coding systems.  */
6695         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6696
6697       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6698         val = Fcons (XCAR (XCAR (safe_codings)), val);
6699       safe_codings = val;
6700     }
6701
6702   return safe_codings;
6703 }
6704
6705
6706 /* Search from position POS for such characters that are unencodable
6707    accoding to SAFE_CHARS, and return a list of their positions.  P
6708    points where in the memory the character at POS exists.  Limit the
6709    search at PEND or when Nth unencodable characters are found.
6710
6711    If SAFE_CHARS is a char table, an element for an unencodable
6712    character is nil.
6713
6714    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6715
6716    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6717    eight-bit-graphic characters are unencodable.  */
6718
6719 static Lisp_Object
6720 unencodable_char_position (safe_chars, pos, p, pend, n)
6721      Lisp_Object safe_chars;
6722      int pos;
6723      unsigned char *p, *pend;
6724      int n;
6725 {
6726   Lisp_Object pos_list;
6727
6728   pos_list = Qnil;
6729   while (p < pend)
6730     {
6731       int len;
6732       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6733
6734       if (c >= 128
6735           && (CHAR_TABLE_P (safe_chars)
6736               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6737               : (NILP (safe_chars) || c < 256)))
6738         {
6739           pos_list = Fcons (make_number (pos), pos_list);
6740           if (--n <= 0)
6741             break;
6742         }
6743       pos++;
6744       p += len;
6745     }
6746   return Fnreverse (pos_list);
6747 }
6748
6749
6750 DEFUN ("unencodable-char-position", Funencodable_char_position,
6751        Sunencodable_char_position, 3, 5, 0,
6752        doc: /*
6753 Return position of first un-encodable character in a region.
6754 START and END specfiy the region and CODING-SYSTEM specifies the
6755 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6756
6757 If optional 4th argument COUNT is non-nil, it specifies at most how
6758 many un-encodable characters to search.  In this case, the value is a
6759 list of positions.
6760
6761 If optional 5th argument STRING is non-nil, it is a string to search
6762 for un-encodable characters.  In that case, START and END are indexes
6763 to the string.  */)
6764      (start, end, coding_system, count, string)
6765      Lisp_Object start, end, coding_system, count, string;
6766 {
6767   int n;
6768   Lisp_Object safe_chars;
6769   struct coding_system coding;
6770   Lisp_Object positions;
6771   int from, to;
6772   unsigned char *p, *pend;
6773
6774   if (NILP (string))
6775     {
6776       validate_region (&start, &end);
6777       from = XINT (start);
6778       to = XINT (end);
6779       if (NILP (current_buffer->enable_multibyte_characters))
6780         return Qnil;
6781       p = CHAR_POS_ADDR (from);
6782       if (to == GPT)
6783         pend = GPT_ADDR;
6784       else
6785         pend = CHAR_POS_ADDR (to);
6786     }
6787   else
6788     {
6789       CHECK_STRING (string);
6790       CHECK_NATNUM (start);
6791       CHECK_NATNUM (end);
6792       from = XINT (start);
6793       to = XINT (end);
6794       if (from > to
6795           || to > SCHARS (string))
6796         args_out_of_range_3 (string, start, end);
6797       if (! STRING_MULTIBYTE (string))
6798         return Qnil;
6799       p = SDATA (string) + string_char_to_byte (string, from);
6800       pend = SDATA (string) + string_char_to_byte (string, to);
6801     }
6802
6803   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6804
6805   if (NILP (count))
6806     n = 1;
6807   else
6808     {
6809       CHECK_NATNUM (count);
6810       n = XINT (count);
6811     }
6812
6813   if (coding.type == coding_type_no_conversion
6814       || coding.type == coding_type_raw_text)
6815     return Qnil;
6816
6817   if (coding.type == coding_type_undecided)
6818     safe_chars = Qnil;
6819   else
6820     safe_chars = coding_safe_chars (coding_system);
6821
6822   if (STRINGP (string)
6823       || from >= GPT || to <= GPT)
6824     positions = unencodable_char_position (safe_chars, from, p, pend, n);
6825   else
6826     {
6827       Lisp_Object args[2];
6828
6829       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6830       n -= XINT (Flength (args[0]));
6831       if (n <= 0)
6832         positions = args[0];
6833       else
6834         {
6835           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6836                                                pend, n);
6837           positions = Fappend (2, args);
6838         }
6839     }
6840
6841   return  (NILP (count) ? Fcar (positions) : positions);
6842 }
6843
6844
6845 Lisp_Object
6846 code_convert_region1 (start, end, coding_system, encodep)
6847      Lisp_Object start, end, coding_system;
6848      int encodep;
6849 {
6850   struct coding_system coding;
6851   int from, to;
6852
6853   CHECK_NUMBER_COERCE_MARKER (start);
6854   CHECK_NUMBER_COERCE_MARKER (end);
6855   CHECK_SYMBOL (coding_system);
6856
6857   validate_region (&start, &end);
6858   from = XFASTINT (start);
6859   to = XFASTINT (end);
6860
6861   if (NILP (coding_system))
6862     return make_number (to - from);
6863
6864   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6865     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6866
6867   coding.mode |= CODING_MODE_LAST_BLOCK;
6868   coding.src_multibyte = coding.dst_multibyte
6869     = !NILP (current_buffer->enable_multibyte_characters);
6870   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6871                        &coding, encodep, 1);
6872   Vlast_coding_system_used = coding.symbol;
6873   return make_number (coding.produced_char);
6874 }
6875
/* Lisp primitive `decode-coding-region'.  All of the work is done by
   code_convert_region1, called here with ENCODEP = 0 (decode).  */
DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
       3, 3, "r\nzCoding system: ",
       doc: /* Decode the current region from the specified coding system.
When called from a program, takes three arguments:
START, END, and CODING-SYSTEM.  START and END are buffer positions.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the decoded text.  */)
     (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  return code_convert_region1 (start, end, coding_system, 0);
}
6890
/* Lisp primitive `encode-coding-region'.  All of the work is done by
   code_convert_region1, called here with ENCODEP = 1 (encode).  */
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
       3, 3, "r\nzCoding system: ",
       doc: /* Encode the current region into the specified coding system.
When called from a program, takes three arguments:
START, END, and CODING-SYSTEM.  START and END are buffer positions.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the encoded text.  */)
     (start, end, coding_system)
     Lisp_Object start, end, coding_system;
{
  return code_convert_region1 (start, end, coding_system, 1);
}
6905
6906 Lisp_Object
6907 code_convert_string1 (string, coding_system, nocopy, encodep)
6908      Lisp_Object string, coding_system, nocopy;
6909      int encodep;
6910 {
6911   struct coding_system coding;
6912
6913   CHECK_STRING (string);
6914   CHECK_SYMBOL (coding_system);
6915
6916   if (NILP (coding_system))
6917     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6918
6919   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6920     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6921
6922   coding.mode |= CODING_MODE_LAST_BLOCK;
6923   string = (encodep
6924             ? encode_coding_string (string, &coding, !NILP (nocopy))
6925             : decode_coding_string (string, &coding, !NILP (nocopy)));
6926   Vlast_coding_system_used = coding.symbol;
6927
6928   return string;
6929 }
6930
/* Lisp primitive `decode-coding-string'.  All of the work is done by
   code_convert_string1, called here with ENCODEP = 0 (decode).  */
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
       2, 3, 0,
       doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
Optional arg NOCOPY non-nil means it is OK to return STRING itself
if the decoding operation is trivial.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
     (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  return code_convert_string1 (string, coding_system, nocopy, 0);
}
6944
/* Lisp primitive `encode-coding-string'.  All of the work is done by
   code_convert_string1, called here with ENCODEP = 1 (encode).  */
DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
       2, 3, 0,
       doc: /* Encode STRING to CODING-SYSTEM, and return the result.
Optional arg NOCOPY non-nil means it is OK to return STRING itself
if the encoding operation is trivial.
This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
     (string, coding_system, nocopy)
     Lisp_Object string, coding_system, nocopy;
{
  return code_convert_string1 (string, coding_system, nocopy, 1);
}
6958
6959 /* Encode or decode STRING according to CODING_SYSTEM.
6960    Do not set Vlast_coding_system_used.
6961
6962    This function is called only from macros DECODE_FILE and
6963    ENCODE_FILE, thus we ignore character composition.  */
6964
6965 Lisp_Object
6966 code_convert_string_norecord (string, coding_system, encodep)
6967      Lisp_Object string, coding_system;
6968      int encodep;
6969 {
6970   struct coding_system coding;
6971
6972   CHECK_STRING (string);
6973   CHECK_SYMBOL (coding_system);
6974
6975   if (NILP (coding_system))
6976     return string;
6977
6978   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6979     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6980
6981   coding.composing = COMPOSITION_DISABLED;
6982   coding.mode |= CODING_MODE_LAST_BLOCK;
6983   return (encodep
6984           ? encode_coding_string (string, &coding, 1)
6985           : decode_coding_string (string, &coding, 1));
6986 }
6987 \f
6988 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6989        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6990 Return the corresponding character.  */)
6991      (code)
6992      Lisp_Object code;
6993 {
6994   unsigned char c1, c2, s1, s2;
6995   Lisp_Object val;
6996
6997   CHECK_NUMBER (code);
6998   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6999   if (s1 == 0)
7000     {
7001       if (s2 < 0x80)
7002         XSETFASTINT (val, s2);
7003       else if (s2 >= 0xA0 || s2 <= 0xDF)
7004         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7005       else
7006         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7007     }
7008   else
7009     {
7010       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7011           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7012         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7013       DECODE_SJIS (s1, s2, c1, c2);
7014       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7015     }
7016   return val;
7017 }
7018
7019 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7020        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7021 Return the corresponding code in SJIS.  */)
7022      (ch)
7023      Lisp_Object ch;
7024 {
7025   int charset, c1, c2, s1, s2;
7026   Lisp_Object val;
7027
7028   CHECK_NUMBER (ch);
7029   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7030   if (charset == CHARSET_ASCII)
7031     {
7032       val = ch;
7033     }
7034   else if (charset == charset_jisx0208
7035            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7036     {
7037       ENCODE_SJIS (c1, c2, s1, s2);
7038       XSETFASTINT (val, (s1 << 8) | s2);
7039     }
7040   else if (charset == charset_katakana_jisx0201
7041            && c1 > 0x20 && c2 < 0xE0)
7042     {
7043       XSETFASTINT (val, c1 | 0x80);
7044     }
7045   else
7046     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7047   return val;
7048 }
7049
/* Lisp primitive `decode-big5-char'.

   CODE is split into a high byte B1 and a low byte B2.  With B1 == 0
   the code is a single byte: values of 0x80 and above are rejected,
   anything else is returned as CODE itself.  With B1 != 0 both bytes
   are range checked and then converted with DECODE_BIG5, which
   supplies the charset and the two position codes for MAKE_CHAR.  */
DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
       doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
Return the corresponding character.  */)
     (code)
     Lisp_Object code;
{
  int charset;
  unsigned char b1, b2, c1, c2;
  Lisp_Object val;

  CHECK_NUMBER (code);
  /* B1 and B2 are `unsigned char', so bits of CODE above the low 16
     are dropped here.  */
  b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
  if (b1 == 0)
    {
      /* Single-byte code: only 0x00..0x7F is accepted.  */
      if (b2 >= 0x80)
        error ("Invalid BIG5 code: %x", XFASTINT (code));
      val = code;
    }
  else
    {
      /* Lead byte must be in 0xA1..0xFE; trail byte must be in
         0x40..0x7E or 0xA1..0xFE.  */
      if ((b1 < 0xA1 || b1 > 0xFE)
          || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
        error ("Invalid BIG5 code: %x", XFASTINT (code));
      DECODE_BIG5 (b1, b2, charset, c1, c2);
      XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
    }
  return val;
}
7078
/* Lisp primitive `encode-big5-char'.

   CH is taken apart with SPLIT_CHAR into CHARSET, C1 and C2.  ASCII
   characters are returned unchanged.  Characters of charset_big5_1
   or charset_big5_2 whose character code lies inside the range
   accepted for that charset are converted with ENCODE_BIG5 into a
   two-byte code (B1 << 8 | B2).  Anything else signals an error.  */
DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
       doc: /* Encode the Big5 character CHAR to BIG5 coding system.
Return the corresponding character code in Big5.  */)
     (ch)
     Lisp_Object ch;
{
  int charset, c1, c2, b1, b2;
  Lisp_Object val;

  CHECK_NUMBER (ch);
  SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
  if (charset == CHARSET_ASCII)
    {
      val = ch;
    }
  /* NOTE(review): the literal bounds below are compared against the
     whole character code; presumably they delimit the characters
     that actually exist in each charset -- confirm against the
     charset definitions.  */
  else if ((charset == charset_big5_1
            && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
           || (charset == charset_big5_2
               && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
    {
      ENCODE_BIG5 (charset, c1, c2, b1, b2);
      XSETFASTINT (val, (b1 << 8) | b2);
    }
  else
    error ("Can't encode to Big5: %d", XFASTINT (ch));
  return val;
}
7106 \f
7107 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7108        Sset_terminal_coding_system_internal, 1, 1, 0,
7109        doc: /* Internal use only.  */)
7110      (coding_system)
7111      Lisp_Object coding_system;
7112 {
7113   CHECK_SYMBOL (coding_system);
7114   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7115   /* We had better not send unsafe characters to terminal.  */
7116   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7117   /* Character composition should be disabled.  */
7118   terminal_coding.composing = COMPOSITION_DISABLED;
7119   /* Error notification should be suppressed.  */
7120   terminal_coding.suppress_error = 1;
7121   terminal_coding.src_multibyte = 1;
7122   terminal_coding.dst_multibyte = 0;
7123   return Qnil;
7124 }
7125
7126 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7127        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7128        doc: /* Internal use only.  */)
7129      (coding_system)
7130      Lisp_Object coding_system;
7131 {
7132   CHECK_SYMBOL (coding_system);
7133   setup_coding_system (Fcheck_coding_system (coding_system),
7134                        &safe_terminal_coding);
7135   /* Character composition should be disabled.  */
7136   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7137   /* Error notification should be suppressed.  */
7138   terminal_coding.suppress_error = 1;
7139   safe_terminal_coding.src_multibyte = 1;
7140   safe_terminal_coding.dst_multibyte = 0;
7141   return Qnil;
7142 }
7143
/* Lisp primitive `terminal-coding-system': return the `symbol' member
   of the global terminal_coding structure.  */
DEFUN ("terminal-coding-system", Fterminal_coding_system,
       Sterminal_coding_system, 0, 0, 0,
       doc: /* Return coding system specified for terminal output.  */)
     ()
{
  return terminal_coding.symbol;
}
7151
/* Lisp primitive `set-keyboard-coding-system-internal'.

   Set up the global keyboard_coding from CODING_SYSTEM (which must be
   a symbol accepted by Fcheck_coding_system) and disable character
   composition for it.  Always returns nil.  */
DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
       Sset_keyboard_coding_system_internal, 1, 1, 0,
       doc: /* Internal use only.  */)
     (coding_system)
     Lisp_Object coding_system;
{
  CHECK_SYMBOL (coding_system);
  setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
  /* Character composition should be disabled.  */
  keyboard_coding.composing = COMPOSITION_DISABLED;
  return Qnil;
}
7164
/* Lisp primitive `keyboard-coding-system': return the `symbol' member
   of the global keyboard_coding structure.  */
DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
       Skeyboard_coding_system, 0, 0, 0,
       doc: /* Return coding system specified for decoding keyboard input.  */)
     ()
{
  return keyboard_coding.symbol;
}
7172
7173 \f
7174 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7175        Sfind_operation_coding_system,  1, MANY, 0,
7176        doc: /* Choose a coding system for an operation based on the target name.
7177 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7178 DECODING-SYSTEM is the coding system to use for decoding
7179 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7180 for encoding (in case OPERATION does encoding).
7181
7182 The first argument OPERATION specifies an I/O primitive:
7183   For file I/O, `insert-file-contents' or `write-region'.
7184   For process I/O, `call-process', `call-process-region', or `start-process'.
7185   For network I/O, `open-network-stream'.
7186
7187 The remaining arguments should be the same arguments that were passed
7188 to the primitive.  Depending on which primitive, one of those arguments
7189 is selected as the TARGET.  For example, if OPERATION does file I/O,
7190 whichever argument specifies the file name is TARGET.
7191
7192 TARGET has a meaning which depends on OPERATION:
7193   For file I/O, TARGET is a file name.
7194   For process I/O, TARGET is a process name.
7195   For network I/O, TARGET is a service name or a port number
7196
7197 This function looks up what specified for TARGET in,
7198 `file-coding-system-alist', `process-coding-system-alist',
7199 or `network-coding-system-alist' depending on OPERATION.
7200 They may specify a coding system, a cons of coding systems,
7201 or a function symbol to call.
7202 In the last case, we call the function with one argument,
7203 which is a list of all the arguments given to this function.
7204
7205 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7206      (nargs, args)
7207      int nargs;
7208      Lisp_Object *args;
7209 {
7210   Lisp_Object operation, target_idx, target, val;
7211   register Lisp_Object chain;
7212
7213   if (nargs < 2)
7214     error ("Too few arguments");
7215   operation = args[0];
7216   if (!SYMBOLP (operation)
7217       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7218     error ("Invalid first argument");
7219   if (nargs < 1 + XINT (target_idx))
7220     error ("Too few arguments for operation: %s",
7221            SDATA (SYMBOL_NAME (operation)));
7222   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7223      argument to write-region) is string, it must be treated as a
7224      target file name.  */
7225   if (EQ (operation, Qwrite_region)
7226       && nargs > 5
7227       && STRINGP (args[5]))
7228     target_idx = make_number (4);
7229   target = args[XINT (target_idx) + 1];
7230   if (!(STRINGP (target)
7231         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7232     error ("Invalid argument %d", XINT (target_idx) + 1);
7233
7234   chain = ((EQ (operation, Qinsert_file_contents)
7235             || EQ (operation, Qwrite_region))
7236            ? Vfile_coding_system_alist
7237            : (EQ (operation, Qopen_network_stream)
7238               ? Vnetwork_coding_system_alist
7239               : Vprocess_coding_system_alist));
7240   if (NILP (chain))
7241     return Qnil;
7242
7243   for (; CONSP (chain); chain = XCDR (chain))
7244     {
7245       Lisp_Object elt;
7246       elt = XCAR (chain);
7247
7248       if (CONSP (elt)
7249           && ((STRINGP (target)
7250                && STRINGP (XCAR (elt))
7251                && fast_string_match (XCAR (elt), target) >= 0)
7252               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7253         {
7254           val = XCDR (elt);
7255           /* Here, if VAL is both a valid coding system and a valid
7256              function symbol, we return VAL as a coding system.  */
7257           if (CONSP (val))
7258             return val;
7259           if (! SYMBOLP (val))
7260             return Qnil;
7261           if (! NILP (Fcoding_system_p (val)))
7262             return Fcons (val, val);
7263           if (! NILP (Ffboundp (val)))
7264             {
7265               val = call1 (val, Flist (nargs, args));
7266               if (CONSP (val))
7267                 return val;
7268               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7269                 return Fcons (val, val);
7270             }
7271           return Qnil;
7272         }
7273     }
7274   return Qnil;
7275 }
7276
7277 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7278        Supdate_coding_systems_internal, 0, 0, 0,
7279        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7280 When values of any coding categories are changed, you must
7281 call this function.  */)
7282      ()
7283 {
7284   int i;
7285
7286   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7287     {
7288       Lisp_Object val;
7289
7290       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7291       if (!NILP (val))
7292         {
7293           if (! coding_system_table[i])
7294             coding_system_table[i] = ((struct coding_system *)
7295                                       xmalloc (sizeof (struct coding_system)));
7296           setup_coding_system (val, coding_system_table[i]);
7297         }
7298       else if (coding_system_table[i])
7299         {
7300           xfree (coding_system_table[i]);
7301           coding_system_table[i] = NULL;
7302         }
7303     }
7304
7305   return Qnil;
7306 }
7307
7308 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7309        Sset_coding_priority_internal, 0, 0, 0,
7310        doc: /* Update internal database for the current value of `coding-category-list'.
7311 This function is internal use only.  */)
7312      ()
7313 {
7314   int i = 0, idx;
7315   Lisp_Object val;
7316
7317   val = Vcoding_category_list;
7318
7319   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7320     {
7321       if (! SYMBOLP (XCAR (val)))
7322         break;
7323       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7324       if (idx >= CODING_CATEGORY_IDX_MAX)
7325         break;
7326       coding_priorities[i++] = (1 << idx);
7327       val = XCDR (val);
7328     }
7329   /* If coding-category-list is valid and contains all coding
7330      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7331      the following code saves Emacs from crashing.  */
7332   while (i < CODING_CATEGORY_IDX_MAX)
7333     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7334
7335   return Qnil;
7336 }
7337
7338 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7339        Sdefine_coding_system_internal, 1, 1, 0,
7340        doc: /* Register CODING-SYSTEM as a base coding system.
7341 This function is internal use only.  */)
7342      (coding_system)
7343      Lisp_Object coding_system;
7344 {
7345   Lisp_Object safe_chars, slot;
7346
7347   if (NILP (Fcheck_coding_system (coding_system)))
7348     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7349   safe_chars = coding_safe_chars (coding_system);
7350   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7351     error ("No valid safe-chars property for %s",
7352            SDATA (SYMBOL_NAME (coding_system)));
7353   if (EQ (safe_chars, Qt))
7354     {
7355       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7356         XSETCAR (Vcoding_system_safe_chars,
7357                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7358     }
7359   else
7360     {
7361       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7362       if (NILP (slot))
7363         XSETCDR (Vcoding_system_safe_chars,
7364                  nconc2 (XCDR (Vcoding_system_safe_chars),
7365                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7366       else
7367         XSETCDR (slot, safe_chars);
7368     }
7369   return Qnil;
7370 }
7371
7372 #endif /* emacs */
7373
7374 \f
7375 /*** 9. Post-amble ***/
7376
7377 void
7378 init_coding_once ()
7379 {
7380   int i;
7381
7382   /* Emacs' internal format specific initialize routine.  */
7383   for (i = 0; i <= 0x20; i++)
7384     emacs_code_class[i] = EMACS_control_code;
7385   emacs_code_class[0x0A] = EMACS_linefeed_code;
7386   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7387   for (i = 0x21 ; i < 0x7F; i++)
7388     emacs_code_class[i] = EMACS_ascii_code;
7389   emacs_code_class[0x7F] = EMACS_control_code;
7390   for (i = 0x80; i < 0xFF; i++)
7391     emacs_code_class[i] = EMACS_invalid_code;
7392   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7393   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7394   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7395   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7396
7397   /* ISO2022 specific initialize routine.  */
7398   for (i = 0; i < 0x20; i++)
7399     iso_code_class[i] = ISO_control_0;
7400   for (i = 0x21; i < 0x7F; i++)
7401     iso_code_class[i] = ISO_graphic_plane_0;
7402   for (i = 0x80; i < 0xA0; i++)
7403     iso_code_class[i] = ISO_control_1;
7404   for (i = 0xA1; i < 0xFF; i++)
7405     iso_code_class[i] = ISO_graphic_plane_1;
7406   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7407   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7408   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7409   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7410   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7411   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7412   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7413   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7414   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7415   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7416
7417   setup_coding_system (Qnil, &keyboard_coding);
7418   setup_coding_system (Qnil, &terminal_coding);
7419   setup_coding_system (Qnil, &safe_terminal_coding);
7420   setup_coding_system (Qnil, &default_buffer_file_coding);
7421
7422   bzero (coding_system_table, sizeof coding_system_table);
7423
7424   bzero (ascii_skip_code, sizeof ascii_skip_code);
7425   for (i = 0; i < 128; i++)
7426     ascii_skip_code[i] = 1;
7427
7428 #if defined (MSDOS) || defined (WINDOWSNT)
7429   system_eol_type = CODING_EOL_CRLF;
7430 #else
7431   system_eol_type = CODING_EOL_LF;
7432 #endif
7433
7434   inhibit_pre_post_conversion = 0;
7435 }
7436
7437 #ifdef emacs
7438
7439 void
7440 syms_of_coding ()
7441 {
7442   Qtarget_idx = intern ("target-idx");
7443   staticpro (&Qtarget_idx);
7444
7445   Qcoding_system_history = intern ("coding-system-history");
7446   staticpro (&Qcoding_system_history);
7447   Fset (Qcoding_system_history, Qnil);
7448
7449   /* Target FILENAME is the first argument.  */
7450   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7451   /* Target FILENAME is the third argument.  */
7452   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7453
7454   Qcall_process = intern ("call-process");
7455   staticpro (&Qcall_process);
7456   /* Target PROGRAM is the first argument.  */
7457   Fput (Qcall_process, Qtarget_idx, make_number (0));
7458
7459   Qcall_process_region = intern ("call-process-region");
7460   staticpro (&Qcall_process_region);
7461   /* Target PROGRAM is the third argument.  */
7462   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7463
7464   Qstart_process = intern ("start-process");
7465   staticpro (&Qstart_process);
7466   /* Target PROGRAM is the third argument.  */
7467   Fput (Qstart_process, Qtarget_idx, make_number (2));
7468
7469   Qopen_network_stream = intern ("open-network-stream");
7470   staticpro (&Qopen_network_stream);
7471   /* Target SERVICE is the fourth argument.  */
7472   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7473
7474   Qcoding_system = intern ("coding-system");
7475   staticpro (&Qcoding_system);
7476
7477   Qeol_type = intern ("eol-type");
7478   staticpro (&Qeol_type);
7479
7480   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7481   staticpro (&Qbuffer_file_coding_system);
7482
7483   Qpost_read_conversion = intern ("post-read-conversion");
7484   staticpro (&Qpost_read_conversion);
7485
7486   Qpre_write_conversion = intern ("pre-write-conversion");
7487   staticpro (&Qpre_write_conversion);
7488
7489   Qno_conversion = intern ("no-conversion");
7490   staticpro (&Qno_conversion);
7491
7492   Qundecided = intern ("undecided");
7493   staticpro (&Qundecided);
7494
7495   Qcoding_system_p = intern ("coding-system-p");
7496   staticpro (&Qcoding_system_p);
7497
7498   Qcoding_system_error = intern ("coding-system-error");
7499   staticpro (&Qcoding_system_error);
7500
7501   Fput (Qcoding_system_error, Qerror_conditions,
7502         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7503   Fput (Qcoding_system_error, Qerror_message,
7504         build_string ("Invalid coding system"));
7505
7506   Qcoding_category = intern ("coding-category");
7507   staticpro (&Qcoding_category);
7508   Qcoding_category_index = intern ("coding-category-index");
7509   staticpro (&Qcoding_category_index);
7510
7511   Vcoding_category_table
7512     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7513   staticpro (&Vcoding_category_table);
7514   {
7515     int i;
7516     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7517       {
7518         XVECTOR (Vcoding_category_table)->contents[i]
7519           = intern (coding_category_name[i]);
7520         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7521               Qcoding_category_index, make_number (i));
7522       }
7523   }
7524
7525   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7526   staticpro (&Vcoding_system_safe_chars);
7527
7528   Qtranslation_table = intern ("translation-table");
7529   staticpro (&Qtranslation_table);
7530   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7531
7532   Qtranslation_table_id = intern ("translation-table-id");
7533   staticpro (&Qtranslation_table_id);
7534
7535   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7536   staticpro (&Qtranslation_table_for_decode);
7537
7538   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7539   staticpro (&Qtranslation_table_for_encode);
7540
7541   Qsafe_chars = intern ("safe-chars");
7542   staticpro (&Qsafe_chars);
7543
7544   Qchar_coding_system = intern ("char-coding-system");
7545   staticpro (&Qchar_coding_system);
7546
7547   /* Intern this now in case it isn't already done.
7548      Setting this variable twice is harmless.
7549      But don't staticpro it here--that is done in alloc.c.  */
7550   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7551   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7552   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7553
7554   Qvalid_codes = intern ("valid-codes");
7555   staticpro (&Qvalid_codes);
7556
7557   Qemacs_mule = intern ("emacs-mule");
7558   staticpro (&Qemacs_mule);
7559
7560   Qraw_text = intern ("raw-text");
7561   staticpro (&Qraw_text);
7562
7563   defsubr (&Scoding_system_p);
7564   defsubr (&Sread_coding_system);
7565   defsubr (&Sread_non_nil_coding_system);
7566   defsubr (&Scheck_coding_system);
7567   defsubr (&Sdetect_coding_region);
7568   defsubr (&Sdetect_coding_string);
7569   defsubr (&Sfind_coding_systems_region_internal);
7570   defsubr (&Sunencodable_char_position);
7571   defsubr (&Sdecode_coding_region);
7572   defsubr (&Sencode_coding_region);
7573   defsubr (&Sdecode_coding_string);
7574   defsubr (&Sencode_coding_string);
7575   defsubr (&Sdecode_sjis_char);
7576   defsubr (&Sencode_sjis_char);
7577   defsubr (&Sdecode_big5_char);
7578   defsubr (&Sencode_big5_char);
7579   defsubr (&Sset_terminal_coding_system_internal);
7580   defsubr (&Sset_safe_terminal_coding_system_internal);
7581   defsubr (&Sterminal_coding_system);
7582   defsubr (&Sset_keyboard_coding_system_internal);
7583   defsubr (&Skeyboard_coding_system);
7584   defsubr (&Sfind_operation_coding_system);
7585   defsubr (&Supdate_coding_systems_internal);
7586   defsubr (&Sset_coding_priority_internal);
7587   defsubr (&Sdefine_coding_system_internal);
7588
7589   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7590                doc: /* List of coding systems.
7591
7592 Do not alter the value of this variable manually.  This variable should be
7593 updated by the functions `make-coding-system' and
7594 `define-coding-system-alias'.  */);
7595   Vcoding_system_list = Qnil;
7596
7597   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7598                doc: /* Alist of coding system names.
7599 Each element is one element list of coding system name.
7600 This variable is given to `completing-read' as TABLE argument.
7601
7602 Do not alter the value of this variable manually.  This variable should be
7603 updated by the functions `make-coding-system' and
7604 `define-coding-system-alias'.  */);
7605   Vcoding_system_alist = Qnil;
7606
7607   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7608                doc: /* List of coding-categories (symbols) ordered by priority.
7609
7610 On detecting a coding system, Emacs tries code detection algorithms
7611 associated with each coding-category one by one in this order.  When
7612 one algorithm agrees with a byte sequence of source text, the coding
7613 system bound to the corresponding coding-category is selected.  */);
7614   {
7615     int i;
7616
7617     Vcoding_category_list = Qnil;
7618     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7619       Vcoding_category_list
7620         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7621                  Vcoding_category_list);
7622   }
7623
7624   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7625                doc: /* Specify the coding system for read operations.
7626 It is useful to bind this variable with `let', but do not set it globally.
7627 If the value is a coding system, it is used for decoding on read operation.
7628 If not, an appropriate element is used from one of the coding system alists:
7629 There are three such tables, `file-coding-system-alist',
7630 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7631   Vcoding_system_for_read = Qnil;
7632
7633   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7634                doc: /* Specify the coding system for write operations.
7635 Programs bind this variable with `let', but you should not set it globally.
7636 If the value is a coding system, it is used for encoding of output,
7637 when writing it to a file and when sending it to a file or subprocess.
7638
7639 If this does not specify a coding system, an appropriate element
7640 is used from one of the coding system alists:
7641 There are three such tables, `file-coding-system-alist',
7642 `process-coding-system-alist', and `network-coding-system-alist'.
7643 For output to files, if the above procedure does not specify a coding system,
7644 the value of `buffer-file-coding-system' is used.  */);
7645   Vcoding_system_for_write = Qnil;
7646
7647   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7648                doc: /* Coding system used in the latest file or process I/O.
7649 Also set by `encode-coding-region', `decode-coding-region',
7650 `encode-coding-string' and `decode-coding-string'.  */);
7651   Vlast_coding_system_used = Qnil;
7652
7653   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7654                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7655 See info node `Coding Systems' and info node `Text and Binary' concerning
7656 such conversion.  */);
7657   inhibit_eol_conversion = 0;
7658
7659   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7660                doc: /* Non-nil means process buffer inherits coding system of process output.
7661 Bind it to t if the process output is to be treated as if it were a file
7662 read from some filesystem.  */);
7663   inherit_process_coding_system = 0;
7664
7665   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7666                doc: /* Alist to decide a coding system to use for a file I/O operation.
7667 The format is ((PATTERN . VAL) ...),
7668 where PATTERN is a regular expression matching a file name,
7669 VAL is a coding system, a cons of coding systems, or a function symbol.
7670 If VAL is a coding system, it is used for both decoding and encoding
7671 the file contents.
7672 If VAL is a cons of coding systems, the car part is used for decoding,
7673 and the cdr part is used for encoding.
7674 If VAL is a function symbol, the function must return a coding system
7675 or a cons of coding systems which are used as above.  The function gets
7676 the arguments with which `find-operation-coding-system' was called.
7677
7678 See also the function `find-operation-coding-system'
7679 and the variable `auto-coding-alist'.  */);
7680   Vfile_coding_system_alist = Qnil;
7681
7682   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7683     doc: /* Alist to decide a coding system to use for a process I/O operation.
7684 The format is ((PATTERN . VAL) ...),
7685 where PATTERN is a regular expression matching a program name,
7686 VAL is a coding system, a cons of coding systems, or a function symbol.
7687 If VAL is a coding system, it is used for both decoding what received
7688 from the program and encoding what sent to the program.
7689 If VAL is a cons of coding systems, the car part is used for decoding,
7690 and the cdr part is used for encoding.
7691 If VAL is a function symbol, the function must return a coding system
7692 or a cons of coding systems which are used as above.
7693
7694 See also the function `find-operation-coding-system'.  */);
7695   Vprocess_coding_system_alist = Qnil;
7696
7697   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7698     doc: /* Alist to decide a coding system to use for a network I/O operation.
7699 The format is ((PATTERN . VAL) ...),
7700 where PATTERN is a regular expression matching a network service name
7701 or is a port number to connect to,
7702 VAL is a coding system, a cons of coding systems, or a function symbol.
7703 If VAL is a coding system, it is used for both decoding what received
7704 from the network stream and encoding what sent to the network stream.
7705 If VAL is a cons of coding systems, the car part is used for decoding,
7706 and the cdr part is used for encoding.
7707 If VAL is a function symbol, the function must return a coding system
7708 or a cons of coding systems which are used as above.
7709
7710 See also the function `find-operation-coding-system'.  */);
7711   Vnetwork_coding_system_alist = Qnil;
7712
7713   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7714                doc: /* Coding system to use with system messages.
7715 Also used for decoding keyboard input on X Window system.  */);
7716   Vlocale_coding_system = Qnil;
7717
7718   /* The eol mnemonics are reset in startup.el system-dependently.  */
7719   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7720                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7721   eol_mnemonic_unix = build_string (":");
7722
7723   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7724                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7725   eol_mnemonic_dos = build_string ("\\");
7726
7727   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7728                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7729   eol_mnemonic_mac = build_string ("/");
7730
7731   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7732                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7733   eol_mnemonic_undecided = build_string (":");
7734
7735   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7736                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7737   Venable_character_translation = Qt;
7738
7739   DEFVAR_LISP ("standard-translation-table-for-decode",
7740                &Vstandard_translation_table_for_decode,
7741                doc: /* Table for translating characters while decoding.  */);
7742   Vstandard_translation_table_for_decode = Qnil;
7743
7744   DEFVAR_LISP ("standard-translation-table-for-encode",
7745                &Vstandard_translation_table_for_encode,
7746                doc: /* Table for translating characters while encoding.  */);
7747   Vstandard_translation_table_for_encode = Qnil;
7748
7749   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7750                doc: /* Alist of charsets vs revision numbers.
7751 While encoding, if a charset (car part of an element) is found,
7752 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7753   Vcharset_revision_alist = Qnil;
7754
7755   DEFVAR_LISP ("default-process-coding-system",
7756                &Vdefault_process_coding_system,
7757                doc: /* Cons of coding systems used for process I/O by default.
7758 The car part is used for decoding a process output,
7759 the cdr part is used for encoding a text to be sent to a process.  */);
7760   Vdefault_process_coding_system = Qnil;
7761
7762   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7763                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7764 This is a vector of length 256.
7765 If Nth element is non-nil, the existence of code N in a file
7766 \(or output of subprocess) doesn't prevent it to be detected as
7767 a coding system of ISO 2022 variant which has a flag
7768 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7769 or reading output of a subprocess.
7770 Only 128th through 159th elements has a meaning.  */);
7771   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7772
7773   DEFVAR_LISP ("select-safe-coding-system-function",
7774                &Vselect_safe_coding_system_function,
7775                doc: /* Function to call to select safe coding system for encoding a text.
7776
7777 If set, this function is called to force a user to select a proper
7778 coding system which can encode the text in the case that a default
7779 coding system used in each operation can't encode the text.
7780
7781 The default value is `select-safe-coding-system' (which see).  */);
7782   Vselect_safe_coding_system_function = Qnil;
7783
7784   DEFVAR_BOOL ("coding-system-require-warning",
7785                &coding_system_require_warning,
7786                doc: /* Internal use only.
7787 If non-nil, on writing a file, `select-safe-coding-system-function' is
7788 called even if `coding-system-for-write' is non-nil.  The command
7789 `universal-coding-system-argument' binds this variable to t temporarily.  */);
7790   coding_system_require_warning = 0;
7791
7792
7793   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7794                &inhibit_iso_escape_detection,
7795                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7796
7797 By default, on reading a file, Emacs tries to detect how the text is
7798 encoded.  This code detection is sensitive to escape sequences.  If
7799 the sequence is valid as ISO2022, the code is determined as one of
7800 the ISO2022 encodings, and the file is decoded by the corresponding
7801 coding system (e.g. `iso-2022-7bit').
7802
7803 However, there may be a case that you want to read escape sequences in
7804 a file as is.  In such a case, you can set this variable to non-nil.
7805 Then, as the code detection ignores any escape sequences, no file is
7806 detected as encoded in some ISO2022 encoding.  The result is that all
7807 escape sequences become visible in a buffer.
7808
7809 The default value is nil, and it is strongly recommended not to change
7810 it.  That is because many Emacs Lisp source files that contain
7811 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7812 in Emacs's distribution, and they won't be decoded correctly on
7813 reading if you suppress escape sequence detection.
7814
7815 The other way to read escape sequences in a file without decoding is
7816 to explicitly specify some coding system that doesn't use ISO2022's
7817 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
7818   inhibit_iso_escape_detection = 0;
7819
7820   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7821                doc: /* Char table for translating self-inserting characters.
7822 This is applied to the result of input methods, not their input.  See also
7823 `keyboard-translate-table'.  */);
7824     Vtranslation_table_for_input = Qnil;
7825 }
7826
7827 char *
7828 emacs_strerror (error_number)
7829      int error_number;
7830 {
7831   char *str;
7832
7833   synchronize_system_messages_locale ();
7834   str = strerror (error_number);
7835
7836   if (! NILP (Vlocale_coding_system))
7837     {
7838       Lisp_Object dec = code_convert_string_norecord (build_string (str),
7839                                                       Vlocale_coding_system,
7840                                                       0);
7841       str = (char *) SDATA (dec);
7842     }
7843
7844   return str;
7845 }
7846
7847 #endif /* emacs */
7848