src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   A coding system is an encoding mechanism for one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
  45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
 113   which appropriate flag bits for the category XXX is set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 116   of the range 0x80..0x9F are in multibyte form.  */
 117 #if 0
 118 int
 119 detect_coding_emacs_mule (src, src_end, multibytep)
 120      unsigned char *src, *src_end;
 121      int multibytep;
 122 {
 123   ...
 124 }
 125 #endif
 126
 127 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 128
 129   These functions decode SRC_BYTES length of unibyte text at SOURCE
 130   encoded in CODING to Emacs' internal format.  The resulting
 131   multibyte text goes to a place pointed to by DESTINATION, the length
 132   of which should not exceed DST_BYTES.
 133
 134   These functions set the information of original and decoded texts in
 135   the members produced, produced_char, consumed, and consumed_char of
 136   the structure *CODING.  They also set the member result to one of
 137   CODING_FINISH_XXX indicating how the decoding finished.
 138
 139   DST_BYTES zero means that the source and destination areas
 140   overlap, which means that we can produce decoded text only until it
 141   reaches the head of the not-yet-decoded source text.
 142
 143   Below is a template of these functions.  */
 144 #if 0
 145 static void
 146 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 147      struct coding_system *coding;
 148      unsigned char *source, *destination;
 149      int src_bytes, dst_bytes;
 150 {
 151   ...
 152 }
 153 #endif
 154
 155 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 156
 157   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 158   internal multibyte format to CODING.  The resulting unibyte text
 159   goes to a place pointed to by DESTINATION, the length of which
 160   should not exceed DST_BYTES.
 161
 162   These functions set the information of original and encoded texts in
 163   the members produced, produced_char, consumed, and consumed_char of
 164   the structure *CODING.  They also set the member result to one of
 165   CODING_FINISH_XXX indicating how the encoding finished.
 166
 167   DST_BYTES zero means that the source and destination areas
 168   overlap, which means that we can produce encoded text only until it
 169   reaches the head of the not-yet-encoded source text.
 170
 171   Below is a template of these functions.  */
 172 #if 0
 173 static void
 174 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 175      struct coding_system *coding;
 176      unsigned char *source, *destination;
 177      int src_bytes, dst_bytes;
 178 {
 179   ...
 180 }
 181 #endif
 182
 183 /*** COMMONLY USED MACROS ***/
 184
 185 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 186    get one and two bytes from the source text respectively.  If there
 187    are not enough bytes, they set coding->result to
 188    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
 189    without advancing `src'.  The caller should set variables `coding',
 190    `src' and `src_end' in advance.  Used by the decoding routines
 191    `decode_coding_XXX', so the source text is assumed to be unibyte.  */
 192
 193 #define ONE_MORE_BYTE(c1)                                       \
 194   do {                                                          \
 195     if (src >= src_end)                                         \
 196       {                                                         \
 197         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 198         goto label_end_of_loop;                                 \
 199       }                                                         \
 200     c1 = *src++;                                                \
 201   } while (0)
 202
 203 #define TWO_MORE_BYTES(c1, c2)                                  \
 204   do {                                                          \
 205     if (src + 1 >= src_end)                                     \
 206       {                                                         \
 207         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 208         goto label_end_of_loop;                                 \
 209       }                                                         \
 210     c1 = *src++;                                                \
 211     c2 = *src++;                                                \
 212   } while (0)
 213
 214
 215 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 216    form if MULTIBYTEP is nonzero: LEADING_CODE_8_BIT_CONTROL followed by
 217    B yields B - 0x20.  A missing B is treated as insufficient source.  */
 218
 219 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 220   do {                                                          \
 221     ONE_MORE_BYTE (c1);                                         \
 222     if ((multibytep) && c1 == LEADING_CODE_8_BIT_CONTROL)       \
 223       {                                                         \
 224         /* Don't read the second byte past `src_end'.  */       \
 225         ONE_MORE_BYTE (c1);                                     \
 226         c1 -= 0x20;                                             \
 227       }                                                         \
 228   } while (0)
 229
 230 /* Set C to the next character of the source text at `src' and
 231    advance `src' past it.  If no source is left, set coding->result
 232    to CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
 233    If `translation_table' is non-nil, C is translated through it.
 234    The caller should set variables `coding', `src', `src_end', and
 235    `translation_table' in advance.  This macro is used in encoding
 236    routines `encode_coding_XXX', thus it assumes multibyte source
 237    text.  8-bit characters are in multibyte form if
 238    coding->src_multibyte is nonzero, else they are single bytes.  */
 239
 240 #define ONE_MORE_CHAR(c)                                        \
 241   do {                                                          \
 242     int len = src_end - src;                                    \
 243     int bytes;                                                  \
 244     if (len <= 0)                                               \
 245       {                                                         \
 246         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 247         goto label_end_of_loop;                                 \
 248       }                                                         \
 249     if (coding->src_multibyte                                   \
 250         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 251       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 252     else                                                        \
 253       c = *src, bytes = 1;                                      \
 254     if (!NILP (translation_table))                              \
 255       c = translate_char (translation_table, c, -1, 0, 0);      \
 256     src += bytes;                                               \
 257   } while (0)
 258
 259
 260 /* Produce a multibyte form of character C to `dst'.  If there's not
 261    enough space at `dst', set coding->result to
 262    CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
 263
 264    If we are now in the middle of a composition sequence, the decoded
 265    character may be ALTCHAR (for the current composition).  In that
 266    case, the character goes to coding->cmp_data->data instead of
 267    `dst'.
 268    This macro is used in decoding routines.  */
 269
 270 #define EMIT_CHAR(c)                                                    \
 271   do {                                                                  \
 272     if (! COMPOSING_P (coding)                                          \
 273         || coding->composing == COMPOSITION_RELATIVE                    \
 274         || coding->composing == COMPOSITION_WITH_RULE)                  \
 275       {                                                                 \
 276         int bytes = CHAR_BYTES (c);                                     \
 277         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 278           {                                                             \
 279             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 280             goto label_end_of_loop;                                     \
 281           }                                                             \
 282         dst += CHAR_STRING (c, dst);                                    \
 283         coding->produced_char++;                                        \
 284       }                                                                 \
 285                                                                         \
 286     if (COMPOSING_P (coding)                                            \
 287         && coding->composing != COMPOSITION_RELATIVE)                   \
 288       {                                                                 \
 289         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 290         coding->composition_rule_follows                                \
 291           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 292       }                                                                 \
 293   } while (0)
 294
 295 /* EMIT_ONE_BYTE, EMIT_TWO_BYTES and EMIT_BYTES store raw bytes at
        `dst' and advance `dst' past them.  The writable area ends at
        `dst_end' if `dst_bytes' is nonzero, else at `src'.  If the bytes
        don't fit, nothing is stored, coding->result is set to
        CODING_FINISH_INSUFFICIENT_DST, and control jumps to
        `label_end_of_loop'.  EMIT_BYTES copies the bytes from FROM up to
        (not including) TO and advances FROM to TO, so FROM must be a
        modifiable pointer.  */

 296 #define EMIT_ONE_BYTE(c)                                        \
 297   do {                                                          \
 298     if (dst >= (dst_bytes ? dst_end : src))                     \
 299       {                                                         \
 300         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 301         goto label_end_of_loop;                                 \
 302       }                                                         \
 303     *dst++ = c;                                                 \
 304   } while (0)
 305
 306 #define EMIT_TWO_BYTES(c1, c2)                                  \
 307   do {                                                          \
 308     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 309       {                                                         \
 310         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 311         goto label_end_of_loop;                                 \
 312       }                                                         \
 313     *dst++ = c1, *dst++ = c2;                                   \
 314   } while (0)
 315
 316 #define EMIT_BYTES(from, to)                                    \
 317   do {                                                          \
 318     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 319       {                                                         \
 320         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 321         goto label_end_of_loop;                                 \
 322       }                                                         \
 323     while (from < to)                                           \
 324       *dst++ = *from++;                                         \
 325   } while (0)
 326
 327 \f
 328 /*** 1. Preamble ***/
 329
 330 #ifdef emacs
 331 #include <config.h>
 332 #endif
 333
 334 #include <stdio.h>
 335
 336 #ifdef emacs
 337
 338 #include "lisp.h"
 339 #include "buffer.h"
 340 #include "charset.h"
 341 #include "composite.h"
 342 #include "ccl.h"
 343 #include "coding.h"
 344 #include "window.h"
 345
 346 #else  /* not emacs */
 347
 348 #include "mulelib.h"
 349
 350 #endif /* not emacs */
 351
 352 Lisp_Object Qcoding_system, Qeol_type;
 353 Lisp_Object Qbuffer_file_coding_system;
 354 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 355 Lisp_Object Qno_conversion, Qundecided;
 356 Lisp_Object Qcoding_system_history;
 357 Lisp_Object Qsafe_chars;
 358 Lisp_Object Qvalid_codes;
 359
 360 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 361 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 362 Lisp_Object Qstart_process, Qopen_network_stream;
 363 Lisp_Object Qtarget_idx;
 364
 365 Lisp_Object Vselect_safe_coding_system_function;
 366
 367 /* Mnemonic string for each format of end-of-line.  */
 368 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 369 /* Mnemonic string to indicate format of end-of-line is not yet
 370    decided.  */
 371 Lisp_Object eol_mnemonic_undecided;
 372
 373 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 374    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 375 int system_eol_type;
 376
 377 #ifdef emacs
 378
 379 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 380
 381 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 382
 383 /* Coding system emacs-mule and raw-text are for converting only
 384    end-of-line format.  */
 385 Lisp_Object Qemacs_mule, Qraw_text;
 386
 387 /* Coding-systems are handed between Emacs Lisp programs and C internal
 388    routines by the following three variables.  */
 389 /* Coding-system for reading files and receiving data from process.  */
 390 Lisp_Object Vcoding_system_for_read;
 391 /* Coding-system for writing files and sending data to process.  */
 392 Lisp_Object Vcoding_system_for_write;
 393 /* Coding-system actually used in the latest I/O.  */
 394 Lisp_Object Vlast_coding_system_used;
 395
 396 /* A vector of length 256 which contains information about special
 397    Latin codes (especially for dealing with Microsoft codes).  */
 398 Lisp_Object Vlatin_extra_code_table;
 399
 400 /* Flag to inhibit code conversion of end-of-line format.  */
 401 int inhibit_eol_conversion;
 402
 403 /* Flag to inhibit ISO2022 escape sequence detection.  */
 404 int inhibit_iso_escape_detection;
 405
 406 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 407 int inherit_process_coding_system;
 408
 409 /* Coding system to be used to encode text for terminal display.  */
 410 struct coding_system terminal_coding;
 411
 412 /* Coding system to be used to encode text for terminal display when
 413    terminal coding system is nil.  */
 414 struct coding_system safe_terminal_coding;
 415
 416 /* Coding system of what is sent from terminal keyboard.  */
 417 struct coding_system keyboard_coding;
 418
 419 /* Default coding system to be used to write a file.  */
 420 struct coding_system default_buffer_file_coding;
 421
 422 Lisp_Object Vfile_coding_system_alist;
 423 Lisp_Object Vprocess_coding_system_alist;
 424 Lisp_Object Vnetwork_coding_system_alist;
 425
 426 Lisp_Object Vlocale_coding_system;
 427
 428 #endif /* emacs */
 429
 430 Lisp_Object Qcoding_category, Qcoding_category_index;
 431
 432 /* List of symbols `coding-category-xxx' ordered by priority.  */
 433 Lisp_Object Vcoding_category_list;
 434
 435 /* Table of coding categories (Lisp symbols).  */
 436 Lisp_Object Vcoding_category_table;
 437
 438 /* Table of the names of the symbols for each coding category.  The
        entries presumably follow the order of the coding category
        indices used to index this table -- confirm against coding.h.  */
 439 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 440   "coding-category-emacs-mule",
 441   "coding-category-sjis",
 442   "coding-category-iso-7",
 443   "coding-category-iso-7-tight",
 444   "coding-category-iso-8-1",
 445   "coding-category-iso-8-2",
 446   "coding-category-iso-7-else",
 447   "coding-category-iso-8-else",
 448   "coding-category-ccl",
 449   "coding-category-big5",
 450   "coding-category-utf-8",
 451   "coding-category-utf-16-be",
 452   "coding-category-utf-16-le",
 453   "coding-category-raw-text",
 454   "coding-category-binary"
 455 };
 456
 457 /* Table of pointers to coding systems corresponding to each coding
 458    categories.  */
 459 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 460
 461 /* Table of coding category masks.  Nth element is a mask for a coding
 462    category whose priority is Nth.  */
 463 static
 464 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 465
 466 /* Flag to tell if we look up translation table on character code
 467    conversion.  */
 468 Lisp_Object Venable_character_translation;
 469 /* Standard translation table to look up on decoding (reading).  */
 470 Lisp_Object Vstandard_translation_table_for_decode;
 471 /* Standard translation table to look up on encoding (writing).  */
 472 Lisp_Object Vstandard_translation_table_for_encode;
 473
 474 Lisp_Object Qtranslation_table;
 475 Lisp_Object Qtranslation_table_id;
 476 Lisp_Object Qtranslation_table_for_decode;
 477 Lisp_Object Qtranslation_table_for_encode;
 478
 479 /* Alist of charsets vs revision number.  */
 480 Lisp_Object Vcharset_revision_alist;
 481
 482 /* Default coding systems used for process I/O.  */
 483 Lisp_Object Vdefault_process_coding_system;
 484
 485 /* Global flag to tell that we can't call post-read-conversion and
 486    pre-write-conversion functions.  Usually the value is zero, but it
 487    is set to 1 temporarily while such functions are running.  This is
 488    to avoid infinite recursive call.  */
 489 static int inhibit_pre_post_conversion;
 490
 491 /* Char-table containing safe coding systems of each character.  */
 492 Lisp_Object Vchar_coding_system_table;
 493 Lisp_Object Qchar_coding_system;
 494
 495 /* Return the `safe-chars' property of coding system CODING if it is
 496    a char-table, else return t.  Don't check validity of CODING.  */
 497
 498 Lisp_Object
 499 coding_safe_chars (coding)
 500      struct coding_system *coding;
 501 {
 502   Lisp_Object coding_spec, plist, safe_chars;
 503
 504   coding_spec = Fget (coding->symbol, Qcoding_system);
       /* The property list is element 3 of the coding spec vector.  */
 505   plist = XVECTOR (coding_spec)->contents[3];
 506   safe_chars = Fplist_get (plist, Qsafe_chars);
 507   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 508 }
 509
     /* Nonzero if SAFE_CHARS is t, or if the char-table SAFE_CHARS has a
        non-nil entry for character C.  */
 510 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 511   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 512
 513 \f
 514 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 515
 516 /* Emacs' internal format for encoding multiple character sets is a
 517    kind of multi-byte encoding, i.e. characters are encoded by
 518    variable-length sequences of one-byte codes.
 519
 520    ASCII characters and control characters (e.g. `tab', `newline') are
 521    represented by one-byte sequences which are their ASCII codes, in
 522    the range 0x00 through 0x7F.
 523
 524    8-bit characters of the range 0x80..0x9F are represented by
 525    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 526    code + 0x20).
 527
 528    8-bit characters of the range 0xA0..0xFF are represented by
 529    one-byte sequences which are their 8-bit code.
 530
 531    The other characters are represented by a sequence of `base
 532    leading-code', optional `extended leading-code', and one or two
 533    `position-code's.  The length of the sequence is determined by the
 534    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 535    whereas extended leading-code and position-code take the range 0xA0
 536    through 0xFF.  See `charset.h' for more details about leading-code
 537    and position-code.
 538
 539    --- CODE RANGE of Emacs' internal format ---
 540    character set        range
 541    -------------        -----
 542    ascii                0x00..0x7F
 543    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 544    eight-bit-graphic    0xA0..0xFF
 545    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 546    ---------------------------------------------
 547
 548   */
 549
 550 enum emacs_code_class_type emacs_code_class[256];
 551
 552 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 553    Check if a text is encoded in Emacs' internal format.  If it is,
 554    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  Zero is
        returned as soon as ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO is
        seen, or when a leading code in 0x81..0x9F does not start a
        sequence accepted by UNIBYTE_STR_AS_MULTIBYTE_P.  Reaching the
        end of the source without such a rejection counts as a match.  */
 555
 556 static int
 557 detect_coding_emacs_mule (src, src_end, multibytep)
 558       unsigned char *src, *src_end;
 559       int multibytep;
 560 {
 561   unsigned char c;
 562   int composing = 0;
 563   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE, which stores into
          coding->result before jumping to `label_end_of_loop'.  */
 564   struct coding_system dummy_coding;
 565   struct coding_system *coding = &dummy_coding;
 566
 567   while (1)
 568     {
 569       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 570
 571       if (composing)
 572         {
               /* COMPOSING was set by the old composition leading code
                  0x80.  A byte below 0xA0 clears the flag; 0xA0 means
                  the next byte is taken with its high bit cleared; any
                  other byte is lowered by 0x20 before the checks
                  below.  */
 573           if (c < 0xA0)
 574             composing = 0;
 575           else if (c == 0xA0)
 576             {
 577               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 578               c &= 0x7F;
 579             }
 580           else
 581             c -= 0x20;
 582         }
 583
 584       if (c < 0x20)
 585         {
 586           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 587             return 0;	/* These codes rule out emacs-mule.  */
 588         }
 589       else if (c >= 0x80 && c < 0xA0)
 590         {
 591           if (c == 0x80)
 592             /* Old leading code for a composite character.  */
 593             composing = 1;
 594           else
 595             {
                   /* C was just read, so the sequence to validate
                      starts one byte before `src'.  */
 596               unsigned char *src_base = src - 1;
 597               int bytes;
 598
 599               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 600                                                bytes))
 601                 return 0;
 602               src = src_base + bytes;	/* Skip the whole sequence.  */
 603             }
 604         }
 605     }
 606  label_end_of_loop:	/* Reached when the source is exhausted.  */
 607   return CODING_CATEGORY_MASK_EMACS_MULE;
 608 }
 609
 610
 611 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Byte sequences accepted by UNIBYTE_STR_AS_MULTIBYTE_P are copied
        as they are, any other byte is converted with CHAR_STRING, and
        carriage-return / line-feed bytes are converted according to
        coding->eol_type.  Every store to DESTINATION is bounds-checked;
        when room runs out, coding->result is set to
        CODING_FINISH_INSUFFICIENT_DST and decoding stops before the
        character that did not fit.  */
 612
 613 static void
 614 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 615      struct coding_system *coding;
 616      unsigned char *source, *destination;
 617      int src_bytes, dst_bytes;
 618 {
 619   unsigned char *src = source;
 620   unsigned char *src_end = source + src_bytes;
 621   unsigned char *dst = destination;
 622   unsigned char *dst_end = destination + dst_bytes;
 623   /* SRC_BASE remembers the start position in source in each loop.
 624      The loop will be exited when there's not enough source text, or
 625      when there's not enough destination area to produce a
 626      character.  */
 627   unsigned char *src_base;
 628
 629   coding->produced_char = 0;
 630   while ((src_base = src) < src_end)
 631     {
 632       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 633       int bytes;
 634
 635       if (*src == '\r')
 636         {
 637           int c = *src++;
 638
 639           if (coding->eol_type == CODING_EOL_CR)
 640             c = '\n';
 641           else if (coding->eol_type == CODING_EOL_CRLF)
 642             {
 643               ONE_MORE_BYTE (c);
 644               if (c != '\n')
 645                 {
 646                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 647                     {
 648                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
 649                       goto label_end_of_loop;
 650                     }
                       /* A lone CR: push back the byte just read and
                          produce the CR itself.  */
 651                   src--;
 652                   c = '\r';
 653                 }
 654             }
               /* Go through EMIT_ONE_BYTE so that a run of end-of-line
                  bytes can't be stored past the destination limit.  */
 655           EMIT_ONE_BYTE (c);
 656           coding->produced_char++;
 657           continue;
 658         }
 659       else if (*src == '\n')
 660         {
 661           if ((coding->eol_type == CODING_EOL_CR
 662                || coding->eol_type == CODING_EOL_CRLF)
 663               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 664             {
 665               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 666               goto label_end_of_loop;
 667             }
               /* Consume the LF before the check: when the areas
                  overlap, EMIT_ONE_BYTE uses `src' as the limit.  */
 668           src++;
               EMIT_ONE_BYTE ('\n');
 669           coding->produced_char++;
 670           continue;
 671         }
 672       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 673         {
               /* Already in multibyte form: copy it as is.  */
 674           p = src;
 675           src += bytes;
 676         }
 677       else
 678         {
               /* Convert the single byte to its multibyte form in TMP.  */
 679           bytes = CHAR_STRING (*src, tmp);
 680           p = tmp;
 681           src++;
 682         }
 683       if (dst + bytes >= (dst_bytes ? dst_end : src))
 684         {
 685           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 686           break;
 687         }
 688       while (bytes--) *dst++ = *p++;
 689       coding->produced_char++;
 690     }
 691  label_end_of_loop:
       /* SRC_BASE is the start of the character that could not be
          processed, or SRC_END if everything was decoded, so only
          whole characters are reported as consumed.  */
 692   coding->consumed = coding->consumed_char = src_base - source;
 693   coding->produced = dst - destination;
 694 }
 695 /* Encoding to emacs-mule is done by encode_eol with the same arguments.  */
 696 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
 697   encode_eol (coding, source, destination, src_bytes, dst_bytes)
 698
 699
 700 \f
 701 /*** 3. ISO2022 handlers ***/
 702
 703 /* The following note describes the coding system ISO2022 briefly.
 704    Since the intention of this note is to help understand the
 705    functions in this file, some parts are NOT ACCURATE or OVERLY
 706    SIMPLIFIED.  For thorough understanding, please refer to the
 707    original document of ISO2022.
 708
 709    ISO2022 provides many mechanisms to encode several character sets
 710    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 711    is encoded using bytes less than 128.  This may make the encoded
 712    text a little bit longer, but the text passes more easily through
 713    several gateways, some of which strip off MSB (Most Significant Bit).
 714
 715    There are two kinds of character sets: control character set and
 716    graphic character set.  The former contains control characters such
 717    as `newline' and `escape' to provide control functions (control
 718    functions are also provided by escape sequences).  The latter
 719    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 720    two control character sets and many graphic character sets.
 721
 722    Graphic character sets are classified into one of the following
 723    four classes, according to the number of bytes (DIMENSION) and
 724    number of characters in one dimension (CHARS) of the set:
 725    - DIMENSION1_CHARS94
 726    - DIMENSION1_CHARS96
 727    - DIMENSION2_CHARS94
 728    - DIMENSION2_CHARS96
 729
 730    In addition, each character set is assigned an identification tag,
 731    unique for each set, called "final character" (denoted as <F>
 732    hereafter).  The <F> of each character set is decided by ECMA(*)
 733    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 734    (0x30..0x3F are for private use only).
 735
 736    Note (*): ECMA = European Computer Manufacturers Association
 737
 738    Here are examples of graphic character set [NAME(<F>)]:
 739         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 740         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 741         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 742         o DIMENSION2_CHARS96 -- none for the moment
 743
 744    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 745         C0 [0x00..0x1F] -- control character plane 0
 746         GL [0x20..0x7F] -- graphic character plane 0
 747         C1 [0x80..0x9F] -- control character plane 1
 748         GR [0xA0..0xFF] -- graphic character plane 1
 749
 750    A control character set is directly designated and invoked to C0 or
 751    C1 by an escape sequence.  The most common case is that:
 752    - ISO646's  control character set is designated/invoked to C0, and
 753    - ISO6429's control character set is designated/invoked to C1,
 754    and usually these designations/invocations are omitted in encoded
 755    text.  In a 7-bit environment, only C0 can be used, and a control
 756    character for C1 is encoded by an appropriate escape sequence to
 757    fit into the environment.  All control characters for C1 are
 758    defined to have corresponding escape sequences.
 759
 760    A graphic character set is at first designated to one of four
 761    graphic registers (G0 through G3), then these graphic registers are
 762    invoked to GL or GR.  These designations and invocations can be
 763    done independently.  The most common case is that G0 is invoked to
 764    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 765    these invocations and designations are omitted in encoded text.
 766    In a 7-bit environment, only GL can be used.
 767
 768    When a graphic character set of CHARS94 is invoked to GL, codes
 769    0x20 and 0x7F of the GL area work as control characters SPACE and
 770    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 771    be used.
 772
 773    There are two ways of invocation: locking-shift and single-shift.
 774    With locking-shift, the invocation lasts until the next different
 775    invocation, whereas with single-shift, the invocation affects the
 776    following character only and doesn't affect the locking-shift
 777    state.  Invocations are done by the following control characters or
 778    escape sequences:
 779
 780    ----------------------------------------------------------------------
 781    abbrev  function                  cntrl escape seq   description
 782    ----------------------------------------------------------------------
 783    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 784    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 785    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 786    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 787    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 788    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 789    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 790    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 791    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 792    ----------------------------------------------------------------------
 793    (*) These are not used by any known coding system.
 794
 795    Control characters for these functions are defined by macros
 796    ISO_CODE_XXX in `coding.h'.
 797
 798    Designations are done by the following escape sequences:
 799    ----------------------------------------------------------------------
 800    escape sequence      description
 801    ----------------------------------------------------------------------
 802    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 803    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 804    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 805    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 806    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 807    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 808    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 809    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 810    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 811    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 812    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 813    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 814    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 815    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 816    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 817    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 818    ----------------------------------------------------------------------
 819
 820    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 821    of dimension 1, chars 94, and final character <F>, etc...
 822
 823    Note (*): Although these designations are not allowed in ISO2022,
 824    Emacs accepts them on decoding, and produces them on encoding
 825    CHARS96 character sets in a coding system which is characterized as
 826    7-bit environment, non-locking-shift, and non-single-shift.
 827
 828    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 829    '(' can be omitted.  We refer to this as "short-form" hereafter.
 830
 831    Now you may notice that there are a lot of ways for encoding the
 832    same multilingual text in ISO2022.  Actually, there exist many
 833    coding systems such as Compound Text (used in X11's inter-client
 834    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 835    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 836    localized platforms), and all of these are variants of ISO2022.
 837
 838    In addition to the above, Emacs handles two more kinds of escape
 839    sequences: ISO6429's direction specification and Emacs' private
 840    sequence for specifying character composition.
 841
 842    ISO6429's direction specification takes the following form:
 843         o CSI ']'      -- end of the current direction
 844         o CSI '0' ']'  -- end of the current direction
 845         o CSI '1' ']'  -- start of left-to-right text
 846         o CSI '2' ']'  -- start of right-to-left text
 847    The control character CSI (0x9B: control sequence introducer) is
 848    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 849
 850    Character composition specification takes the following form:
 851         o ESC '0' -- start relative composition
 852         o ESC '1' -- end composition
 853         o ESC '2' -- start rule-base composition (*)
 854         o ESC '3' -- start relative composition with alternate chars  (**)
 855         o ESC '4' -- start rule-base composition with alternate chars  (**)
 856   Since these are not standard escape sequences of any ISO standard,
 857   their use with these meanings is restricted to Emacs only.
 858
 859   (*) This form is used only in Emacs 20.5 and the older versions,
 860   but the newer versions can safely decode it.
 861   (**) This form is used only in Emacs 21.1 and the newer versions,
 862   and the older versions can't decode it.
 863
 864   Here's a list of example usages of these composition escape
 865   sequences (categorized by `enum composition_method').
 866
 867   COMPOSITION_RELATIVE:
 868         ESC 0 CHAR [ CHAR ] ESC 1
 869   COMPOSITION_WITH_RULE:
 870         ESC 2 CHAR [ RULE CHAR ] ESC 1
 871   COMPOSITION_WITH_ALTCHARS:
 872         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 873   COMPOSITION_WITH_RULE_ALTCHARS:
 874         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 875
     /* Table of 256 `enum iso_code_class_type' values.
        decode_coding_iso2022 indexes it with each code C1 it reads from
        the source (`switch (iso_code_class [c1])') to choose how that
        code is handled.  */
 876 enum iso_code_class_type iso_code_class[256];
 877
     /* Non-zero if all of the following hold for the coding system stored
        at index IDX of coding_system_table:
          - the table entry is non-null,
          - CHARSET is CHARSET_ASCII, or the character C satisfies
            CODING_SAFE_CHAR_P for the value coding_safe_chars returns
            for that coding system,
          - the coding system's requested designation for CHARSET is not
            CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.

        Side effect: when CHARSET is not CHARSET_ASCII (and the entry is
        non-null), the macro assigns to `safe_chars', which is not a macro
        argument and must be a variable visible where the macro is used.
        IDX is evaluated several times.  */
 878 #define CHARSET_OK(idx, charset, c)                                     \
 879   (coding_system_table[idx]                                             \
 880    && (charset == CHARSET_ASCII                                         \
 881        || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
 882            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
 883    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
 884                                               charset)                  \
 885        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
 886
     /* Non-zero if CODING_SPEC_ISO_INITIAL_DESIGNATION, applied to the
        coding system at index IDX of coding_system_table with second
        argument 1 (presumably graphic register G1 -- confirm against
        coding.h), yields a value >= 0.  detect_coding_iso2022 uses this
        to decide whether an SO code can be taken as a locking shift even
        though nothing has been designated to reg[1] in the text so far.
        Unlike CHARSET_OK, the table entry is not tested for null here.  */
 887 #define SHIFT_OUT_OK(idx) \
 888   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 889
 890 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 891    Check if a text is encoded in ISO2022.  If it is, return an
 892    integer in which the appropriate flag bits, any of:
 893         CODING_CATEGORY_MASK_ISO_7
 894         CODING_CATEGORY_MASK_ISO_7_TIGHT
 895         CODING_CATEGORY_MASK_ISO_8_1
 896         CODING_CATEGORY_MASK_ISO_8_2
 897         CODING_CATEGORY_MASK_ISO_7_ELSE
 898         CODING_CATEGORY_MASK_ISO_8_ELSE
 899    are set.  If a code which should never appear in ISO2022 is found,
 900    return 0.

        The text examined lies between SRC and SRC_END; the loop runs
        while SRC < SRC_END.  MULTIBYTEP is handed unchanged to
        ONE_MORE_BYTE_CHECK_MULTIBYTE, which fetches each code.

        Two masks are maintained.  MASK starts as CODING_CATEGORY_MASK_ISO
        and has categories removed as the text rules them out; the loop
        stops early once MASK becomes 0.  MASK_FOUND collects the
        categories for which some evidence was seen.  The value returned
        at the end is MASK & MASK_FOUND.  */
 901
 902 static int
 903 detect_coding_iso2022 (src, src_end, multibytep)
 904      unsigned char *src, *src_end;
 905      int multibytep;
 906 {
 907   int mask = CODING_CATEGORY_MASK_ISO;
 908   int mask_found = 0;
       /* reg[N] holds the charset most recently designated to register N
          by a designation sequence seen in the text, or -1 if none.
          NOTE(review): no statement written out in this function assigns
          shift_out after this initialization, so the `shift_out == 1'
          test under ISO_CODE_SI looks like it can never succeed --
          confirm.  */
 909   int reg[4], shift_out = 0, single_shifting = 0;
       /* NOTE(review): this `i' appears to be unused; the block near the
          end of the loop declares its own `i'.  */
 910   int c, c1, i, charset;
 911   /* Dummy for ONE_MORE_BYTE.  */
 912   struct coding_system dummy_coding;
 913   struct coding_system *coding = &dummy_coding;
       /* Assigned as a side effect of CHARSET_OK.  */
 914   Lisp_Object safe_chars;
 915
 916   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 917   while (mask && src < src_end)
 918     {
 919       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 920       switch (c)
 921         {
 922         case ISO_CODE_ESC:
 923           if (inhibit_iso_escape_detection)
 924             break;
 925           single_shifting = 0;
 926           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 927           if (c >= '(' && c <= '/')
 928             {
 929               /* Designation sequence for a charset of dimension 1.  */
 930               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
 931               if (c1 < ' ' || c1 >= 0x80
 932                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 933                 /* Invalid designation sequence.  Just ignore.  */
 934                 break;
                   /* '(' ')' '*' '+' and ',' '-' '.' '/' select registers
                      0..3 in that order, hence the modulo.  */
 935               reg[(c - '(') % 4] = charset;
 936             }
 937           else if (c == '$')
 938             {
 939               /* Designation sequence for a charset of dimension 2.  */
 940               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 941               if (c >= '@' && c <= 'B')
 942                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 943                 reg[0] = charset = iso_charset_table[1][0][c];
 944               else if (c >= '(' && c <= '/')
 945                 {
 946                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
 947                   if (c1 < ' ' || c1 >= 0x80
 948                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 949                     /* Invalid designation sequence.  Just ignore.  */
 950                     break;
 951                   reg[(c - '(') % 4] = charset;
 952                 }
 953               else
 954                 /* Invalid designation sequence.  Just ignore.  */
 955                 break;
 956             }
 957           else if (c == 'N' || c == 'O')
 958             {
 959               /* ESC <Fe> for SS2 or SS3.  */
 960               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 961               break;
 962             }
 963           else if (c >= '0' && c <= '4')
 964             {
 965               /* ESC <Fp> for start/end composition.  */
 966               mask_found |= CODING_CATEGORY_MASK_ISO;
 967               break;
 968             }
 969           else
 970             /* Invalid escape sequence.  Just ignore.  */
 971             break;
 972
 973           /* We found a valid designation sequence for CHARSET.  */
               /* Only the two designation branches above reach this point;
                  every other branch leaves the switch with `break'.  For
                  each category tested below, CHARSET_OK decides whether the
                  designation counts as evidence for that category or rules
                  it out.  */
 974           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 975           c = MAKE_CHAR (charset, 0, 0);
 976           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
 977             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 978           else
 979             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 980           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
 981             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 982           else
 983             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 984           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
 985             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 986           else
 987             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 988           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
 989             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 990           else
 991             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 992           break;
 993
 994         case ISO_CODE_SO:
 995           if (inhibit_iso_escape_detection)
 996             break;
 997           single_shifting = 0;
 998           if (shift_out == 0
 999               && (reg[1] >= 0
1000                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1001                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1002             {
1003               /* Locking shift out.  */
1004               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1005               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1006             }
1007           break;
1008
1009         case ISO_CODE_SI:
1010           if (inhibit_iso_escape_detection)
1011             break;
1012           single_shifting = 0;
1013           if (shift_out == 1)
1014             {
1015               /* Locking shift in.  */
1016               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1017               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1018             }
1019           break;
1020
1021         case ISO_CODE_CSI:
1022           single_shifting = 0;
               /* Fall through: CSI shares the handling below, except for
                  the single-shift part guarded by `c != ISO_CODE_CSI'.  */
1023         case ISO_CODE_SS2:
1024         case ISO_CODE_SS3:
1025           {
1026             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1027
1028             if (inhibit_iso_escape_detection)
1029               break;
                 /* NOTE(review): the coding_system_table entries are
                    dereferenced below (and in the `default' case) without
                    the null test that CHARSET_OK applies -- confirm they
                    cannot be null here.  */
1030             if (c != ISO_CODE_CSI)
1031               {
1032                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1033                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1034                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1035                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1036                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1037                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1038                 single_shifting = 1;
1039               }
                 /* A code with a non-nil entry in Vlatin_extra_code_table
                    keeps ISO_8_1 / ISO_8_2 possible when the respective
                    coding system has CODING_FLAG_ISO_LATIN_EXTRA set.  */
1040             if (VECTORP (Vlatin_extra_code_table)
1041                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1042               {
1043                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1044                     & CODING_FLAG_ISO_LATIN_EXTRA)
1045                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1046                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1047                     & CODING_FLAG_ISO_LATIN_EXTRA)
1048                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1049               }
1050             mask &= newmask;
1051             mask_found |= newmask;
1052           }
1053           break;
1054
1055         default:
1056           if (c < 0x80)
1057             {
1058               single_shifting = 0;
1059               break;
1060             }
1061           else if (c < 0xA0)
1062             {
                   /* A code in 0x80..0x9F not handled by the cases above.
                      It is tolerated only if Vlatin_extra_code_table has
                      a non-nil entry for it; otherwise the text is
                      rejected outright.  */
1063               single_shifting = 0;
1064               if (VECTORP (Vlatin_extra_code_table)
1065                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1066                 {
1067                   int newmask = 0;
1068
1069                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1070                       & CODING_FLAG_ISO_LATIN_EXTRA)
1071                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1072                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1073                       & CODING_FLAG_ISO_LATIN_EXTRA)
1074                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1075                   mask &= newmask;
1076                   mask_found |= newmask;
1077                 }
1078               else
1079                 return 0;
1080             }
1081           else
1082             {
1083               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1084                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1085               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1086               /* Check the length of succeeding codes of the range
1087                  0xA0..0xFF.  If the byte length is odd, we exclude
1088                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1089                  when we are not single shifting.  */
1090               if (!single_shifting
1091                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1092                 {
                       /* Counts the code already read plus the codes
                          >= 0xA0 that follow it.  */
1093                   int i = 1;
1094                   while (src < src_end)
1095                     {
1096                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1097                       if (c < 0xA0)
1098                         break;
1099                       i++;
1100                     }
1101
                       /* An odd run excludes ISO_8_2 only when source
                          remains after it (src < src_end).  */
1102                   if (i & 1 && src < src_end)
1103                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1104                   else
1105                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1106                 }
1107             }
1108           break;
1109         }
1110     }
       /* No statement written out in this function jumps to the label
          below; presumably ONE_MORE_BYTE_CHECK_MULTIBYTE does so when the
          source runs out -- TODO confirm against the macro definition.  */
1111  label_end_of_loop:
1112   return (mask & mask_found);
1113 }
1114
1115 /* Decode a character whose charset is CHARSET, whose 1st position
1116    code is C1, and whose 2nd position code is C2, and return the
1117    decoded character code.  If the variable `translation_table' is
1118    non-nil, return instead the value of translate_char called with
        that table.  `translation_table' is not a macro argument; it must
        be a variable visible where the macro is used.  */
1119
1120 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1121   (NILP (translation_table)                     \
1122    ? MAKE_CHAR (charset, c1, c2)                \
1123    : translate_char (translation_table, -1, charset, c1, c2))
1124
1125 /* Set designation state into CODING.

        REG is the graphic register being designated; DIMENSION, CHARS and
        FINAL_CHAR identify the charset through ISO_CHARSET_TABLE.

        The macro jumps to `label_invalid_code' when:
          - FINAL_CHAR is below '0' or not below 128,
          - no charset is found (negative table value), or the charset is
            neither requested for REG by CODING nor accepted by
            CODING_SAFE_CHAR_P against `safe_chars'; in this case REG is
            also remembered in last_invalid_designation_register,
          - ASCII is designated to register 0 while
            last_invalid_designation_register is 0; the field is reset to
            -1 first (see the comment inside the macro).

        Otherwise last_invalid_designation_register is reset to -1 and the
        charset is stored as the designation of REG.  If
        CODING_MODE_DIRECTION is set in coding->mode and the charset has a
        reverse charset, the reverse charset is stored instead.

        `coding', `safe_chars' and the label `label_invalid_code' are not
        macro arguments; they are taken from the scope where the macro is
        used.  */
1126 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1127   do {                                                                     \
1128     int charset, c;                                                        \
1129                                                                            \
1130     if (final_char < '0' || final_char >= 128)                             \
1131       goto label_invalid_code;                                             \
1132     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1133                                  make_number (chars),                      \
1134                                  make_number (final_char));                \
1135     c = MAKE_CHAR (charset, 0, 0);                                         \
1136     if (charset >= 0                                                       \
1137         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1138             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1139       {                                                                    \
1140         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1141             && reg == 0                                                    \
1142             && charset == CHARSET_ASCII)                                   \
1143           {                                                                \
1144             /* We should insert this designation sequence as is so         \
1145                that it is surely written back to a file.  */               \
1146             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1147             goto label_invalid_code;                                       \
1148           }                                                                \
1149         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1150         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1151             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1152           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1153         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1154       }                                                                    \
1155     else                                                                   \
1156       {                                                                    \
1157         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1158         goto label_invalid_code;                                           \
1159       }                                                                    \
1160   } while (0)
1161
1162 /* Get one more composition_data block from xmalloc, store CHAR_OFFSET
1163    in it, and link it after CODING's current block as the new current.  */
1164
1165 void
1166 coding_allocate_composition_data (coding, char_offset)
1167      struct coding_system *coding;
1168      int char_offset;
1169 {
1170   struct composition_data *last = coding->cmp_data;
1171   struct composition_data *fresh
1172     = (struct composition_data *) xmalloc (sizeof *fresh);
1173   fresh->next = NULL;
1174   fresh->prev = last;
1175   fresh->used = 0;
1176   fresh->char_offset = char_offset;
1177   if (last != NULL)
1178     last->next = fresh;
1179   coding->cmp_data_start = 0;
1180   coding->cmp_data = fresh;
1181 }
1182
1183 /* Record the starting position START and METHOD of one composition.

        A four-slot header is opened at index `used' of the current block
        CODING->cmp_data:
          data[0] = -1 for now (CODING_ADD_COMPOSITION_END stores a
                    length there),
          data[1] = the block's char_offset plus START,
          data[2] = not written here (CODING_ADD_COMPOSITION_END stores
                    the end position there),
          data[3] = METHOD converted to int.
        The header's index is saved in CODING->cmp_data_start and `used'
        is advanced by 4.  The macro itself does not check for room in
        the block.  */
1184
1185 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
1186   do {                                                          \
1187     struct composition_data *cmp_data = coding->cmp_data;       \
1188     int *data = cmp_data->data + cmp_data->used;                \
1189     coding->cmp_data_start = cmp_data->used;                    \
1190     data[0] = -1;                                               \
1191     data[1] = cmp_data->char_offset + start;                    \
1192     data[3] = (int) method;                                     \
1193     cmp_data->used += 4;                                        \
1194   } while (0)
1195
1196 /* Record the ending position END of the current composition.

        The header located at CODING->cmp_data_start in the current block
        is completed: data[0] receives the number of slots used since the
        header began (`used' minus cmp_data_start, so the header itself
        is counted), and data[2] receives the block's char_offset plus
        END.  */
1197
1198 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
1199   do {                                                          \
1200     struct composition_data *cmp_data = coding->cmp_data;       \
1201     int *data = cmp_data->data + coding->cmp_data_start;        \
1202     data[0] = cmp_data->used - coding->cmp_data_start;          \
1203     data[2] = cmp_data->char_offset + end;                      \
1204   } while (0)
1205
1206 /* Record one COMPONENT (alternate character or composition rule).
        COMPONENT is stored in the next free slot of the current block
        CODING->cmp_data and `used' is incremented.  This is an expression
        whose value is the stored component; no bounds check is made.  */
1207
1208 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
1209   (coding->cmp_data->data[coding->cmp_data->used++] = component)
1210
1211 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
        C1 is the character that followed ESC.

        - If composition is disabled (coding->composing is
          COMPOSITION_DISABLED), ESC and C1 masked to 7 bits are written
          through `dst' and coding->produced_char grows by 2.
        - If no composition is in progress, the macro first checks that
          coding->cmp_data exists and has room; if not it sets
          coding->result to CODING_FINISH_INSUFFICIENT_CMP and jumps to
          `label_end_of_loop'.  Otherwise the method is chosen from C1
          ('0' relative, '2' with rule, '3' with altchars, anything else
          with rule and altchars) and the start is recorded at
          coding->produced_char.
        - If a composition is already in progress with one of the two
          ALTCHARS methods, the method is switched to
          COMPOSITION_RELATIVE; with any other method nothing is done.

        `coding', `dst' and the label `label_end_of_loop' are taken from
        the scope where the macro is used.  */
1212
1213 #define DECODE_COMPOSITION_START(c1)                                       \
1214   do {                                                                     \
1215     if (coding->composing == COMPOSITION_DISABLED)                         \
1216       {                                                                    \
1217         *dst++ = ISO_CODE_ESC;                                             \
1218         *dst++ = c1 & 0x7f;                                                \
1219         coding->produced_char += 2;                                        \
1220       }                                                                    \
1221     else if (!COMPOSING_P (coding))                                        \
1222       {                                                                    \
1223         /* This is surely the start of a composition.  We must be sure     \
1224            that coding->cmp_data has enough space to store the             \
1225            information about the composition.  If not, terminate the       \
1226            current decoding loop, allocate one more memory block for       \
1227            coding->cmp_data in the caller, then start the decoding         \
1228            loop again.  We can't allocate memory here directly because     \
1229            it may cause buffer/string relocation.  */                      \
1230         if (!coding->cmp_data                                              \
1231             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1232                 >= COMPOSITION_DATA_SIZE))                                 \
1233           {                                                                \
1234             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1235             goto label_end_of_loop;                                        \
1236           }                                                                \
1237         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1238                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1239                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1240                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1241         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1242                                       coding->composing);                  \
1243         coding->composition_rule_follows = 0;                              \
1244       }                                                                    \
1245     else                                                                   \
1246       {                                                                    \
1247         /* We are already handling a composition.  If the method is        \
1248            the following two, the codes following the current escape       \
1249            sequence are actual characters stored in a buffer.  */          \
1250         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1251             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1252           {                                                                \
1253             coding->composing = COMPOSITION_RELATIVE;                      \
1254             coding->composition_rule_follows = 0;                          \
1255           }                                                                \
1256       }                                                                    \
1257   } while (0)
1258
1259 /* Handle composition end sequence ESC 1.  C1 is the character that
        followed ESC.

        If composition is disabled, ESC and C1 are written through `dst'
        and coding->produced_char grows by 2.  Otherwise the end of the
        current composition is recorded at coding->produced_char and
        coding->composing is set to COMPOSITION_NO.

        `coding' and `dst' are taken from the scope where the macro is
        used.  Unlike DECODE_COMPOSITION_START, C1 is stored without
        masking to 7 bits.  */
1260
1261 #define DECODE_COMPOSITION_END(c1)                                      \
1262   do {                                                                  \
1263     if (coding->composing == COMPOSITION_DISABLED)                      \
1264       {                                                                 \
1265         *dst++ = ISO_CODE_ESC;                                          \
1266         *dst++ = c1;                                                    \
1267         coding->produced_char += 2;                                     \
1268       }                                                                 \
1269     else                                                                \
1270       {                                                                 \
1271         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1272         coding->composing = COMPOSITION_NO;                             \
1273       }                                                                 \
1274   } while (0)
1275
1276 /* Decode a composition rule from the byte C1 (and maybe one more byte
1277    from SRC) and store one encoded composition rule in
1278    coding->cmp_data.

        C1 must be a modifiable variable: 32 is subtracted from it in
        place.  With the reduced value:
          - below 81 (old format): the two reference points are C1 / 9
            and C1 % 9, a value of 4 in either being replaced by 10;
          - 81..92 (new format): one more byte is read into `c2' with
            ONE_MORE_BYTE, and the rule is built from C1 - 81 and
            c2 - 32;
          - anything else: the rule stays 0.
        The rule is then appended as a composition component and
        coding->composition_rule_follows is cleared.

        `coding' and `c2' are taken from the scope where the macro is
        used.  */
1279
1280 #define DECODE_COMPOSITION_RULE(c1)                                     \
1281   do {                                                                  \
1282     int rule = 0;                                                       \
1283     (c1) -= 32;                                                         \
1284     if (c1 < 81)                /* old format (before ver.21) */        \
1285       {                                                                 \
1286         int gref = (c1) / 9;                                            \
1287         int nref = (c1) % 9;                                            \
1288         if (gref == 4) gref = 10;                                       \
1289         if (nref == 4) nref = 10;                                       \
1290         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1291       }                                                                 \
1292     else if (c1 < 93)           /* new format (after ver.21) */         \
1293       {                                                                 \
1294         ONE_MORE_BYTE (c2);                                             \
1295         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1296       }                                                                 \
1297     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1298     coding->composition_rule_follows = 0;                               \
1299   } while (0)
1300
1301
1302 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1303
1304 static void
1305 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1306      struct coding_system *coding;
1307      unsigned char *source, *destination;
1308      int src_bytes, dst_bytes;
1309 {
1310   unsigned char *src = source;
1311   unsigned char *src_end = source + src_bytes;
1312   unsigned char *dst = destination;
1313   unsigned char *dst_end = destination + dst_bytes;
1314   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1315   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1316   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1317   /* SRC_BASE remembers the start position in source in each loop.
1318      The loop will be exited when there's not enough source code
1319      (within macro ONE_MORE_BYTE), or when there's not enough
1320      destination area to produce a character (within macro
1321      EMIT_CHAR).  */
1322   unsigned char *src_base;
1323   int c, charset;
1324   Lisp_Object translation_table;
1325   Lisp_Object safe_chars;
1326
1327   safe_chars = coding_safe_chars (coding);
1328
1329   if (NILP (Venable_character_translation))
1330     translation_table = Qnil;
1331   else
1332     {
1333       translation_table = coding->translation_table_for_decode;
1334       if (NILP (translation_table))
1335         translation_table = Vstandard_translation_table_for_decode;
1336     }
1337
1338   coding->result = CODING_FINISH_NORMAL;
1339
1340   while (1)
1341     {
1342       int c1, c2;
1343
1344       src_base = src;
1345       ONE_MORE_BYTE (c1);
1346
1347       /* We produce no character or one character.  */
1348       switch (iso_code_class [c1])
1349         {
1350         case ISO_0x20_or_0x7F:
1351           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1352             {
1353               DECODE_COMPOSITION_RULE (c1);
1354               continue;
1355             }
1356           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1357             {
1358               /* This is SPACE or DEL.  */
1359               charset = CHARSET_ASCII;
1360               break;
1361             }
1362           /* This is a graphic character, we fall down ...  */
1363
1364         case ISO_graphic_plane_0:
1365           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1366             {
1367               DECODE_COMPOSITION_RULE (c1);
1368               continue;
1369             }
1370           charset = charset0;
1371           break;
1372
1373         case ISO_0xA0_or_0xFF:
1374           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1375               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1376             goto label_invalid_code;
1377           /* This is a graphic character, we fall down ... */
1378
1379         case ISO_graphic_plane_1:
1380           if (charset1 < 0)
1381             goto label_invalid_code;
1382           charset = charset1;
1383           break;
1384
1385         case ISO_control_0:
1386           if (COMPOSING_P (coding))
1387             DECODE_COMPOSITION_END ('1');
1388
1389           /* All ISO2022 control characters in this class have the
1390              same representation in Emacs internal format.  */
1391           if (c1 == '\n'
1392               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1393               && (coding->eol_type == CODING_EOL_CR
1394                   || coding->eol_type == CODING_EOL_CRLF))
1395             {
1396               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1397               goto label_end_of_loop;
1398             }
1399           charset = CHARSET_ASCII;
1400           break;
1401
1402         case ISO_control_1:
1403           if (COMPOSING_P (coding))
1404             DECODE_COMPOSITION_END ('1');
1405           goto label_invalid_code;
1406
1407         case ISO_carriage_return:
1408           if (COMPOSING_P (coding))
1409             DECODE_COMPOSITION_END ('1');
1410
1411           if (coding->eol_type == CODING_EOL_CR)
1412             c1 = '\n';
1413           else if (coding->eol_type == CODING_EOL_CRLF)
1414             {
1415               ONE_MORE_BYTE (c1);
1416               if (c1 != ISO_CODE_LF)
1417                 {
1418                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1419                     {
1420                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1421                       goto label_end_of_loop;
1422                     }
1423                   src--;
1424                   c1 = '\r';
1425                 }
1426             }
1427           charset = CHARSET_ASCII;
1428           break;
1429
1430         case ISO_shift_out:
1431           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1432               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1433             goto label_invalid_code;
1434           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1435           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1436           continue;
1437
1438         case ISO_shift_in:
1439           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1440             goto label_invalid_code;
1441           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1442           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1443           continue;
1444
1445         case ISO_single_shift_2_7:
1446         case ISO_single_shift_2:
1447           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1448             goto label_invalid_code;
1449           /* SS2 is handled as an escape sequence of ESC 'N' */
1450           c1 = 'N';
1451           goto label_escape_sequence;
1452
1453         case ISO_single_shift_3:
1454           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1455             goto label_invalid_code;
1456           /* SS3 is handled as an escape sequence of ESC 'O' */
1457           c1 = 'O';
1458           goto label_escape_sequence;
1459
1460         case ISO_control_sequence_introducer:
1461           /* CSI is handled as an escape sequence of ESC '[' ...  */
1462           c1 = '[';
1463           goto label_escape_sequence;
1464
1465         case ISO_escape:
1466           ONE_MORE_BYTE (c1);
1467         label_escape_sequence:
1468           /* Escape sequences handled by Emacs are invocation,
1469              designation, direction specification, and character
1470              composition specification.  */
1471           switch (c1)
1472             {
1473             case '&':           /* revision of following character set */
1474               ONE_MORE_BYTE (c1);
1475               if (!(c1 >= '@' && c1 <= '~'))
1476                 goto label_invalid_code;
1477               ONE_MORE_BYTE (c1);
1478               if (c1 != ISO_CODE_ESC)
1479                 goto label_invalid_code;
1480               ONE_MORE_BYTE (c1);
1481               goto label_escape_sequence;
1482
1483             case '$':           /* designation of 2-byte character set */
1484               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1485                 goto label_invalid_code;
1486               ONE_MORE_BYTE (c1);
1487               if (c1 >= '@' && c1 <= 'B')
1488                 {       /* designation of JISX0208.1978, GB2312.1980,
1489                            or JISX0208.1980 */
1490                   DECODE_DESIGNATION (0, 2, 94, c1);
1491                 }
1492               else if (c1 >= 0x28 && c1 <= 0x2B)
1493                 {       /* designation of DIMENSION2_CHARS94 character set */
1494                   ONE_MORE_BYTE (c2);
1495                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1496                 }
1497               else if (c1 >= 0x2C && c1 <= 0x2F)
1498                 {       /* designation of DIMENSION2_CHARS96 character set */
1499                   ONE_MORE_BYTE (c2);
1500                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1501                 }
1502               else
1503                 goto label_invalid_code;
1504               /* We must update these variables now.  */
1505               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1506               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1507               continue;
1508
1509             case 'n':           /* invocation of locking-shift-2 */
1510               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1511                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1512                 goto label_invalid_code;
1513               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1514               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1515               continue;
1516
1517             case 'o':           /* invocation of locking-shift-3 */
1518               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1519                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1520                 goto label_invalid_code;
1521               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1522               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1523               continue;
1524
1525             case 'N':           /* invocation of single-shift-2 */
1526               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1527                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1528                 goto label_invalid_code;
1529               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1530               ONE_MORE_BYTE (c1);
1531               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1532                 goto label_invalid_code;
1533               break;
1534
1535             case 'O':           /* invocation of single-shift-3 */
1536               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1537                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1538                 goto label_invalid_code;
1539               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1540               ONE_MORE_BYTE (c1);
1541               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1542                 goto label_invalid_code;
1543               break;
1544
1545             case '0': case '2': case '3': case '4': /* start composition */
1546               DECODE_COMPOSITION_START (c1);
1547               continue;
1548
1549             case '1':           /* end composition */
1550               DECODE_COMPOSITION_END (c1);
1551               continue;
1552
1553             case '[':           /* specification of direction */
1554               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1555                 goto label_invalid_code;
1556               /* For the moment, nested direction is not supported.
1557                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1558                  left-to-right, and nonzero means right-to-left.  */
1559               ONE_MORE_BYTE (c1);
1560               switch (c1)
1561                 {
1562                 case ']':       /* end of the current direction */
1563                   coding->mode &= ~CODING_MODE_DIRECTION;
1564
1565                 case '0':       /* end of the current direction */
1566                 case '1':       /* start of left-to-right direction */
1567                   ONE_MORE_BYTE (c1);
1568                   if (c1 == ']')
1569                     coding->mode &= ~CODING_MODE_DIRECTION;
1570                   else
1571                     goto label_invalid_code;
1572                   break;
1573
1574                 case '2':       /* start of right-to-left direction */
1575                   ONE_MORE_BYTE (c1);
1576                   if (c1 == ']')
1577                     coding->mode |= CODING_MODE_DIRECTION;
1578                   else
1579                     goto label_invalid_code;
1580                   break;
1581
1582                 default:
1583                   goto label_invalid_code;
1584                 }
1585               continue;
1586
1587             default:
1588               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1589                 goto label_invalid_code;
1590               if (c1 >= 0x28 && c1 <= 0x2B)
1591                 {       /* designation of DIMENSION1_CHARS94 character set */
1592                   ONE_MORE_BYTE (c2);
1593                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1594                 }
1595               else if (c1 >= 0x2C && c1 <= 0x2F)
1596                 {       /* designation of DIMENSION1_CHARS96 character set */
1597                   ONE_MORE_BYTE (c2);
1598                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1599                 }
1600               else
1601                 goto label_invalid_code;
1602               /* We must update these variables now.  */
1603               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1604               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1605               continue;
1606             }
1607         }
1608
1609       /* Now we know CHARSET and 1st position code C1 of a character.
1610          Produce a multibyte sequence for that character while getting
1611          2nd position code C2 if necessary.  */
1612       if (CHARSET_DIMENSION (charset) == 2)
1613         {
1614           ONE_MORE_BYTE (c2);
1615           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1616             /* C2 is not in a valid range.  */
1617             goto label_invalid_code;
1618         }
1619       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1620       EMIT_CHAR (c);
1621       continue;
1622
1623     label_invalid_code:
1624       coding->errors++;
1625       if (COMPOSING_P (coding))
1626         DECODE_COMPOSITION_END ('1');
1627       src = src_base;
1628       c = *src++;
1629       EMIT_CHAR (c);
1630     }
1631
1632  label_end_of_loop:
1633   coding->consumed = coding->consumed_char = src_base - source;
1634   coding->produced = dst - destination;
1635   return;
1636 }
1637
1638
1639 /* ISO2022 encoding stuff.  */
1640
1641 /*
1642    It is not enough to say just "ISO2022" on encoding, we have to
1643    specify more details.  In Emacs, each coding system of ISO2022
1644    variant has the following specifications:
1645         1. Initial designation to G0 thru G3.
1646         2. Allows short-form designation?
1647         3. ASCII should be designated to G0 before control characters?
1648         4. ASCII should be designated to G0 at end of line?
1649         5. 7-bit environment or 8-bit environment?
1650         6. Use locking-shift?
1651         7. Use Single-shift?
1652    And the following two are only for Japanese:
1653         8. Use JISX0201-1976-Roman in place of ASCII?
1654         9. Use JISX0208-1978 in place of JISX0208-1983?
1655    These specifications are encoded in `coding->flags' as flag bits
1656    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1657    details.
1658 */
1659
1660 /* Produce codes (escape sequence) for designating CHARSET to graphic
1661    register REG at DST (a variable of the expanding scope), increment DST,
1662    and record CHARSET as the designation of REG in CODING.  The short form
1663    (no intermediate char) is used only in the case noted below.  */
1664
1665 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1666   do {                                                                  \
1667     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1668     char *intermediate_char_94 = "()*+";  /* indexed by REG */          \
1669     char *intermediate_char_96 = ",-./";  /* indexed by REG */          \
1670     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
1671     /* A revision below 255 is announced as ESC '&' ('@' + revision).  */ \
1672     if (revision < 255)                                                 \
1673       {                                                                 \
1674         *dst++ = ISO_CODE_ESC;                                          \
1675         *dst++ = '&';                                                   \
1676         *dst++ = '@' + revision;                                        \
1677       }                                                                 \
1678     *dst++ = ISO_CODE_ESC;                                              \
1679     if (CHARSET_DIMENSION (charset) == 1)                               \
1680       {                                                                 \
1681         if (CHARSET_CHARS (charset) == 94)                              \
1682           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1683         else                                                            \
1684           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1685       }                                                                 \
1686     else                                                                \
1687       {                                                                 \
1688         *dst++ = '$';  /* emitted when the dimension is not 1 */        \
1689         if (CHARSET_CHARS (charset) == 94)                              \
1690           {  /* Short form: flag set, REG 0, final char '@'..'B'.  */   \
1691             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1692                 || reg != 0                                             \
1693                 || final_char < '@' || final_char > 'B')                \
1694               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1695           }                                                             \
1696         else                                                            \
1697           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1698       }                                                                 \
1699     *dst++ = final_char;                                                \
1700     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1701   } while (0)
1702
1703 /* The following two macros produce codes (control character or escape
1704    sequence) for ISO2022 single-shift functions (single-shift-2 and
1705    single-shift-3).  They use `coding' and `dst' of the expanding scope.  */
1706
1707 #define ENCODE_SINGLE_SHIFT_2                           \
1708   do {                                                  \
1709     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1710       *dst++ = ISO_CODE_ESC, *dst++ = 'N';  /* two-byte escape form */ \
1711     else                                                \
1712       *dst++ = ISO_CODE_SS2;                /* one control byte */ \
1713     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; /* pending shift */ \
1714   } while (0)
1715
1716 #define ENCODE_SINGLE_SHIFT_3                           \
1717   do {  /* Emit single-shift-3 at DST and flag it in CODING.  */ \
1718     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1719       *dst++ = ISO_CODE_ESC, *dst++ = 'O';  /* two-byte escape form */ \
1720     else                                                \
1721       *dst++ = ISO_CODE_SS3;                /* one control byte */ \
1722     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; /* pending shift */ \
1723   } while (0)
1724
1725 /* The following four macros produce codes (control character or escape
1726    sequence) for ISO2022 locking-shift functions (shift-in, shift-out,
1727    locking-shift-2, -3) and record the register invoked to plane 0.  */
1728
1729 #define ENCODE_SHIFT_IN                         \
1730   do {  /* SI: graphic register 0 to graphic plane 0.  */ \
1731     *dst++ = ISO_CODE_SI;                       \
1732     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1733   } while (0)
1734
1735 #define ENCODE_SHIFT_OUT                        \
1736   do {  /* SO: graphic register 1 to graphic plane 0.  */ \
1737     *dst++ = ISO_CODE_SO;                       \
1738     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1739   } while (0)
1740
1741 #define ENCODE_LOCKING_SHIFT_2                  \
1742   do {  /* ESC 'n': graphic register 2 to graphic plane 0.  */ \
1743     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1744     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1745   } while (0)
1746
1747 #define ENCODE_LOCKING_SHIFT_3                  \
1748   do {  /* ESC 'o': graphic register 3 to graphic plane 0.  */ \
1749     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1750     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1751   } while (0)
1752
1753 /* Produce codes for a DIMENSION1 character whose character set is
1754    CHARSET and whose position-code is C1.  Designation and invocation
1755    sequences are also produced in advance if necessary.  Uses `coding'
1756 */ /* and `dst' of the expanding scope; the body is a real loop left by `break'.  */
1757 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1758   do {                                                                  \
1759     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1760       {  /* A single-shift code was just emitted for this character.  */ \
1761         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1762           *dst++ = c1 & 0x7F;                                           \
1763         else                                                            \
1764           *dst++ = c1 | 0x80;                                           \
1765         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1766         break;                                                          \
1767       }                                                                 \
1768     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1769       {  /* CHARSET is on graphic plane 0: high bit cleared.  */        \
1770         *dst++ = c1 & 0x7F;                                             \
1771         break;                                                          \
1772       }                                                                 \
1773     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1774       {  /* CHARSET is on graphic plane 1: high bit set.  */            \
1775         *dst++ = c1 | 0x80;                                             \
1776         break;                                                          \
1777       }                                                                 \
1778     else                                                                \
1779       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1780          must invoke it, or, at first, designate it to some graphic     \
1781          register.  Then repeat the loop to actually produce the        \
1782          character.  */                                                 \
1783       dst = encode_invocation_designation (charset, coding, dst);       \
1784   } while (1)
1785
1786 /* Produce codes for a DIMENSION2 character whose character set is
1787    CHARSET and whose position-codes are C1 and C2.  Designation and
1788    invocation codes are also produced in advance if necessary.  */
1789 /* Uses `coding' and `dst' of the expanding scope; the loop is left by `break'.  */
1790 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1791   do {                                                                  \
1792     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1793       {  /* A single-shift code was just emitted for this character.  */ \
1794         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1795           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1796         else                                                            \
1797           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1798         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1799         break;                                                          \
1800       }                                                                 \
1801     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1802       {  /* CHARSET is on graphic plane 0: high bits cleared.  */       \
1803         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1804         break;                                                          \
1805       }                                                                 \
1806     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1807       {  /* CHARSET is on graphic plane 1: high bits set.  */           \
1808         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1809         break;                                                          \
1810       }                                                                 \
1811     else                                                                \
1812       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1813          must invoke it, or, at first, designate it to some graphic     \
1814          register.  Then repeat the loop to actually produce the        \
1815          character.  */                                                 \
1816       dst = encode_invocation_designation (charset, coding, dst);       \
1817   } while (1)
1818
1819 #define ENCODE_ISO_CHARACTER(c)                                 \
1820   do {  /* Encode character C at DST (from the expanding scope).  */ \
1821     int charset, c1, c2;                                        \
1822     /* SPLIT_CHAR fills CHARSET, C1 and C2 from C.  */          \
1823     SPLIT_CHAR (c, charset, c1, c2);                            \
1824     if (CHARSET_DEFINED_P (charset))                            \
1825       {                                                         \
1826         if (CHARSET_DIMENSION (charset) == 1)                   \
1827           {  /* With USE_ROMAN, ASCII goes out as latin-jisx0201.  */ \
1828             if (charset == CHARSET_ASCII                        \
1829                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
1830               charset = charset_latin_jisx0201;                 \
1831             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
1832           }                                                     \
1833         else                                                    \
1834           {  /* With USE_OLDJIS, jisx0208 goes out as jisx0208-1978.  */ \
1835             if (charset == charset_jisx0208                     \
1836                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
1837               charset = charset_jisx0208_1978;                  \
1838             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
1839           }                                                     \
1840       }                                                         \
1841     else                                                        \
1842       {  /* Undefined charset: copy C1, and C2 unless negative, as is.  */ \
1843         *dst++ = c1;                                            \
1844         if (c2 >= 0)                                            \
1845           *dst++ = c2;                                          \
1846       }                                                         \
1847   } while (0)
1848
1849
1850 /* Instead of encoding character C, produce one or two `?'s.  */
1851 /* The substitute is encoded a second time when C's charset is wider than 1.  */
1852 #define ENCODE_UNSAFE_CHARACTER(c)                                      \
1853   do {                                                                  \
1854     ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);       \
1855     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                           \
1856       ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
1857   } while (0)
1858
1859
1860 /* Produce at DST the designation and invocation codes needed to use
1861    CHARSET, updating the element `spec.iso2022' of *CODING to match.
1862    Return new DST.  No bound on the output area is checked here.  */
1863
1864 unsigned char *
1865 encode_invocation_designation (charset, coding, dst)
1866      int charset;
1867      struct coding_system *coding;
1868      unsigned char *dst;
1869 {
1870   int reg;                      /* graphic register number */
1871
1872   /* At first, look for a register CHARSET is already designated to.  */
1873   for (reg = 0; reg < 4; reg++)
1874     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1875       break;
1876
1877   if (reg >= 4)
1878     {
1879       /* CHARSET is not yet designated to any graphic registers.  */
1880       /* Prefer the register that CODING requests for CHARSET.  */
1881       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1882       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1883         /* Since CHARSET requests no special designation, designate it
1884            to graphic register 0.  */
1885         reg = 0;
1886
1887       ENCODE_DESIGNATION (charset, reg, coding);
1888     }
1889
1890   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1891       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1892     {
1893       /* Since the graphic register REG is not invoked to any graphic
1894          planes, invoke it to graphic plane 0.  */
1895       switch (reg)
1896         {
1897         case 0:                 /* graphic register 0: shift-in */
1898           ENCODE_SHIFT_IN;
1899           break;
1900
1901         case 1:                 /* graphic register 1: shift-out */
1902           ENCODE_SHIFT_OUT;
1903           break;
1904
1905         case 2:                 /* graphic register 2 */
1906           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1907             ENCODE_SINGLE_SHIFT_2;  /* recorded invocation is unchanged */
1908           else
1909             ENCODE_LOCKING_SHIFT_2;
1910           break;
1911
1912         case 3:                 /* graphic register 3 */
1913           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1914             ENCODE_SINGLE_SHIFT_3;  /* recorded invocation is unchanged */
1915           else
1916             ENCODE_LOCKING_SHIFT_3;
1917           break;
1918         }
1919     }
1920
1921   return dst;
1922 }
1923
1924 /* Produce 2-byte codes for encoded composition rule RULE.  */
1925 /* The bytes are 32 + 81 + GREF and 32 + NREF, written at `dst' of the expanding scope.  */
1926 #define ENCODE_COMPOSITION_RULE(rule)           \
1927   do {                                          \
1928     int gref, nref;                             \
1929     COMPOSITION_DECODE_RULE (rule, gref, nref); /* fills GREF, NREF */ \
1930     *dst++ = 32 + 81 + gref;                    \
1931     *dst++ = 32 + nref;                         \
1932   } while (0)
1933
1934 /* Produce codes for indicating the start of a composition sequence
1935    (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
1936    which specify information about the composition.  See the comment
1937    in coding.h for the format of DATA.  Writes at `dst' of the caller.  */
1938
1939 #define ENCODE_COMPOSITION_START(coding, data)                          \
1940   do {                                                                  \
1941     coding->composing = data[3];  /* DATA[3] selects the ESC code */    \
1942     *dst++ = ISO_CODE_ESC;                                              \
1943     if (coding->composing == COMPOSITION_RELATIVE)                      \
1944       *dst++ = '0';                                                     \
1945     else                                                                \
1946       {  /* '3' for COMPOSITION_WITH_ALTCHARS, '4' for the rest.  */    \
1947         *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
1948                   ? '3' : '4');                                         \
1949         coding->cmp_data_index = coding->cmp_data_start + 4;            \
1950         coding->composition_rule_follows = 0;                           \
1951       }                                                                 \
1952   } while (0)
1953
1954 /* Produce codes for indicating the end of the current composition.  */
1955 /* Emits ESC '1' at `dst' of the expanding scope and advances CODING's composition data.  */
1956 #define ENCODE_COMPOSITION_END(coding, data)                    \
1957   do {                                                          \
1958     *dst++ = ISO_CODE_ESC;                                      \
1959     *dst++ = '1';                                               \
1960     coding->cmp_data_start += data[0];  /* skip DATA[0] entries */ \
1961     coding->composing = COMPOSITION_NO;                         \
1962     if (coding->cmp_data_start == coding->cmp_data->used        \
1963         && coding->cmp_data->next)                              \
1964       {  /* This chunk is used up: continue at the next one.  */ \
1965         coding->cmp_data = coding->cmp_data->next;              \
1966         coding->cmp_data_start = 0;                             \
1967       }                                                         \
1968   } while (0)
1969
1970 /* Produce composition start sequence ESC 0.  Here, this sequence
1971    doesn't mean the start of a new composition but means that we have
1972    just produced components (alternate chars and composition rules) of
1973    the composition and the actual text follows in SRC.  */
1974
1975 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
1976   do {  /* Writes at `dst' of the expanding scope.  */ \
1977     *dst++ = ISO_CODE_ESC;                      \
1978     *dst++ = '0';                               \
1979     coding->composing = COMPOSITION_RELATIVE;   /* same state as ESC 0 */ \
1980   } while (0)
1981
1982 /* The following three macros produce codes for indicating direction
1983    of text.  They use `coding' and `dst' of the expanding scope.  */
1984 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1985   do {  /* CSI: ESC '[' when the 7-bit flag is set, else one byte.  */ \
1986     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1987       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1988     else                                                \
1989       *dst++ = ISO_CODE_CSI;                            \
1990   } while (0)
1991
1992 #define ENCODE_DIRECTION_R2L    \
1993   ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '2', *dst++ = ']'
1994
1995 #define ENCODE_DIRECTION_L2R    \
1996   ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '0', *dst++ = ']'
1997
1998 /* Produce codes for designation and invocation to reset the graphic
1999    planes and registers to initial state.  Uses `coding' and `dst'.  */
2000 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2001   do {                                                                      \
2002     int reg;                                                                \
2003     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2004       ENCODE_SHIFT_IN;  /* put register 0 back on graphic plane 0 */        \
2005     for (reg = 0; reg < 4; reg++)  /* redo only designations that differ */ \
2006       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
2007           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
2008               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
2009         ENCODE_DESIGNATION                                                  \
2010           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2011   } while (0)
2012
2013 /* Produce designation sequences of charsets in the line started from
2014    SRC to a place pointed by DST, and return updated DST.
2015
2016    If the current block ends before any end-of-line, we may fail to
2017    find all the necessary designations.  */
2018
2019 static unsigned char *
2020 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2021      struct coding_system *coding;
2022      Lisp_Object translation_table;
2023      unsigned char *src, *src_end, *dst;
2024 {
2025   int charset, c, found = 0, reg;
2026   /* Table of charsets to be designated to each graphic register.  */
2027   int r[4];
2028
2029   for (reg = 0; reg < 4; reg++)
2030     r[reg] = -1;
2031
2032   while (found < 4)
2033     {
2034       ONE_MORE_CHAR (c);
2035       if (c == '\n')
2036         break;
2037
2038       charset = CHAR_CHARSET (c);
2039       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2040       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2041         {
2042           found++;
2043           r[reg] = charset;
2044         }
2045     }
2046
2047  label_end_of_loop:
2048   if (found)
2049     {
2050       for (reg = 0; reg < 4; reg++)
2051         if (r[reg] >= 0
2052             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2053           ENCODE_DESIGNATION (r[reg], reg, coding);
2054     }
2055
2056   return dst;
2057 }
2058
2059 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

     Encode the SRC_BYTES bytes of text at SOURCE with the ISO2022
     handler according to CODING, and store the result at DESTINATION,
     an area of DST_BYTES bytes.

     On return, CODING->consumed is the number of source bytes
     processed (counted up to the start of the last, unfinished loop),
     CODING->consumed_char the number of source characters counted,
     and both CODING->produced and CODING->produced_char the number of
     bytes stored.  CODING->errors is reset to zero here and counts at
     least the non-ASCII single-byte characters copied as is.
     CODING->result is set to CODING_FINISH_INSUFFICIENT_DST when the
     loop stops at its head for lack of room in DESTINATION; the
     macros used in the loop may set other results.  */
2060
2061 static void
2062 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2063      struct coding_system *coding;
2064      unsigned char *source, *destination;
2065      int src_bytes, dst_bytes;
2066 {
2067   unsigned char *src = source;
2068   unsigned char *src_end = source + src_bytes;
2069   unsigned char *dst = destination;
2070   unsigned char *dst_end = destination + dst_bytes;
2071   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2072      from DST_END to assure overflow checking is necessary only at the
2073      head of loop.  */
2074   unsigned char *adjusted_dst_end = dst_end - 19;
2075   /* SRC_BASE remembers the start position in source in each loop.
2076      The loop will be exited when there's not enough source text to
2077      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2078      there's not enough destination area to produce encoded codes
2079      (within macro EMIT_BYTES).  */
2080   unsigned char *src_base;
2081   int c;
2082   Lisp_Object translation_table;
2083   Lisp_Object safe_chars;
2084
       /* SAFE_CHARS is consulted through CODING_SAFE_CHAR_P below, and
          only when CODING_FLAG_ISO_SAFE is set in CODING->flags.  */
2085   safe_chars = coding_safe_chars (coding);
2086
       /* Choose the translation table: none when character translation
          is disabled, else the one of CODING, else the standard one.  */
2087   if (NILP (Venable_character_translation))
2088     translation_table = Qnil;
2089   else
2090     {
2091       translation_table = coding->translation_table_for_encode;
2092       if (NILP (translation_table))
2093         translation_table = Vstandard_translation_table_for_encode;
2094     }
2095
2096   coding->consumed_char = 0;
2097   coding->errors = 0;
2098   while (1)
2099     {
2100       src_base = src;
2101
           /* The only destination overflow check of the loop.  If
              DST_BYTES is zero, the limit is taken from SRC instead of
              the end of DESTINATION.
              NOTE(review): presumably that is the case of source and
              destination sharing one area -- confirm with the callers.  */
2102       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2103         {
2104           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2105           break;
2106         }
2107
2108       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2109           && CODING_SPEC_ISO_BOL (coding))
2110         {
2111           /* We have to produce designation sequences if any now.  */
2112           dst = encode_designation_at_bol (coding, translation_table,
2113                                            src, src_end, dst);
2114           CODING_SPEC_ISO_BOL (coding) = 0;
2115         }
2116
2117       /* Check composition start and end.  */
2118       if (coding->composing != COMPOSITION_DISABLED
2119           && coding->cmp_data_start < coding->cmp_data->used)
2120         {
2121           struct composition_data *cmp_data = coding->cmp_data;
               /* DATA points to the record of the current composition.
                  As used below: DATA[0] is added to cmp_data_start to
                  find the end of the components, DATA[1] and DATA[2]
                  are compared with the character position to detect
                  the start and the end of the composition.  */
2122           int *data = cmp_data->data + coding->cmp_data_start;
2123           int this_pos = cmp_data->char_offset + coding->consumed_char;
2124
2125           if (coding->composing == COMPOSITION_RELATIVE)
2126             {
2127               if (this_pos == data[2])
2128                 {
2129                   ENCODE_COMPOSITION_END (coding, data);
                       /* Reload CMP_DATA and DATA from CODING after
                          the macro, so that the start check below
                          looks at the current record.  */
2130                   cmp_data = coding->cmp_data;
2131                   data = cmp_data->data + coding->cmp_data_start;
2132                 }
2133             }
2134           else if (COMPOSING_P (coding))
2135             {
2136               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2137               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2138                 /* We have consumed components of the composition.
2139                    What follows in SRC is the composition's base
2140                    text.  */
2141                 ENCODE_COMPOSITION_FAKE_START (coding);
2142               else
2143                 {
                       /* Take the next component from the composition
                          data, not from SRC.  This C shadows the C of
                          the function.  */
2144                   int c = cmp_data->data[coding->cmp_data_index++];
2145                   if (coding->composition_rule_follows)
2146                     {
2147                       ENCODE_COMPOSITION_RULE (c);
2148                       coding->composition_rule_follows = 0;
2149                     }
2150                   else
2151                     {
2152                       if (coding->flags & CODING_FLAG_ISO_SAFE
2153                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2154                         ENCODE_UNSAFE_CHARACTER (c);
2155                       else
2156                         ENCODE_ISO_CHARACTER (c);
2157                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2158                         coding->composition_rule_follows = 1;
2159                     }
                       /* A component consumes nothing from SRC, so
                          skip the increment of consumed_char at the
                          bottom of the loop.  */
2160                   continue;
2161                 }
2162             }
2163           if (!COMPOSING_P (coding))
2164             {
2165               if (this_pos == data[1])
2166                 {
2167                   ENCODE_COMPOSITION_START (coding, data);
2168                   continue;
2169                 }
2170             }
2171         }
2172
2173       ONE_MORE_CHAR (c);
2174
2175       /* Now encode the character C.  */
2176       if (c < 0x20 || c == 0x7F)
2177         {
2178           if (c == '\r')
2179             {
2180               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2181                 {
                       /* An ordinary control code.  Note that this
                          path skips the increment of consumed_char at
                          the bottom of the loop.  */
2182                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2183                     ENCODE_RESET_PLANE_AND_REGISTER;
2184                   *dst++ = c;
2185                   continue;
2186                 }
2187               /* fall down to treat '\r' as '\n' ...  */
2188               c = '\n';
2189             }
2190           if (c == '\n')
2191             {
2192               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2193                 ENCODE_RESET_PLANE_AND_REGISTER;
                   /* With CODING_FLAG_ISO_INIT_AT_BOL, only the record
                      of the current designations is reset; nothing is
                      written to DST for it.  */
2194               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2195                 bcopy (coding->spec.iso2022.initial_designation,
2196                        coding->spec.iso2022.current_designation,
2197                        sizeof coding->spec.iso2022.initial_designation);
                   /* Produce the end-of-line in the format of
                      CODING->eol_type; an undecided type is treated
                      as LF.  */
2198               if (coding->eol_type == CODING_EOL_LF
2199                   || coding->eol_type == CODING_EOL_UNDECIDED)
2200                 *dst++ = ISO_CODE_LF;
2201               else if (coding->eol_type == CODING_EOL_CRLF)
2202                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2203               else
2204                 *dst++ = ISO_CODE_CR;
                   /* Remember that the next character starts a line.  */
2205               CODING_SPEC_ISO_BOL (coding) = 1;
2206             }
2207           else
2208             {
2209               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2210                 ENCODE_RESET_PLANE_AND_REGISTER;
2211               *dst++ = c;
2212             }
2213         }
2214       else if (ASCII_BYTE_P (c))
2215         ENCODE_ISO_CHARACTER (c);
2216       else if (SINGLE_BYTE_CHAR_P (c))
2217         {
               /* A single-byte character that is not ASCII is copied
                  as is and counted as an error.  */
2218           *dst++ = c;
2219           coding->errors++;
2220         }
2221       else if (coding->flags & CODING_FLAG_ISO_SAFE
2222                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2223         ENCODE_UNSAFE_CHARACTER (c);
2224       else
2225         ENCODE_ISO_CHARACTER (c);
2226
2227       coding->consumed_char++;
2228     }
2229
       /* Reached by the `break' above and by jumps out of the macros
          used in the loop.  SRC_BASE, not SRC, gives the consumed
          amount, so that a partly processed character is not counted.  */
2230  label_end_of_loop:
2231   coding->consumed = src_base - source;
2232   coding->produced = coding->produced_char = dst - destination;
2233 }
2234
2235 \f
2236 /*** 4. SJIS and BIG5 handlers ***/
2237
2238 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2239    quite widely.  So, for the moment, Emacs supports them in the bare
2240    C code.  But, in the future, they may be supported only by CCL.  */
2241
2242 /* SJIS is a coding system encoding three character sets: ASCII, right
2243    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2244    as is.  A character of charset katakana-jisx0201 is encoded by
2245    "position-code + 0x80".  A character of charset japanese-jisx0208
2246    is encoded in two bytes, but the two position-codes are divided and
2247    shifted so that they fit in the ranges below.
2248
2249    --- CODE RANGE of SJIS ---
2250    (character set)      (range)
2251    ASCII                0x00 .. 0x7F
2252    KATAKANA-JISX0201    0xA0 .. 0xDF
2253    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2254             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2255    -------------------------------
2256
2257 */
2258
2259 /* BIG5 is a coding system encoding two character sets: ASCII and
2260    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2261    character set and is encoded in two-byte.
2262
2263    --- CODE RANGE of BIG5 ---
2264    (character set)      (range)
2265    ASCII                0x00 .. 0x7F
2266    Big5 (1st byte)      0xA1 .. 0xFE
2267         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2268    --------------------------
2269
2270    Since the number of characters in Big5 is larger than maximum
2271    characters in Emacs' charset (96x96), it can't be handled as one
2272    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2273    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2274    contains frequently used characters and the latter contains less
2275    frequently used characters.  */
2276
2277 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2278    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2279    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2280    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2281
/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 code B1, B2 and store the result in CHARSET
   (`charset_big5_1' or `charset_big5_2'), C1 and C2.  A code whose
   B1 is less than 0xC9 belongs to `charset_big5_1'.  C1 and C2 are
   assigned only after B1 and B2 have been read, so the same variables
   may be given for B1/C1 and for B2/C2.  B1 and B2 are evaluated more
   than once; do not give expressions with side effects.  Every
   argument is parenthesized in the expansion so that an expression
   such as `x & 0x7F' can be given safely.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* The reverse of DECODE_BIG5: encode the character of CHARSET
   (`charset_big5_1' or `charset_big5_2') whose position-codes are C1
   and C2, and store the BIG5 code in B1 and B2.  B1 and B2 are
   assigned only after C1 and C2 have been read, so the same variables
   may be given for C1/B1 and for C2/B2.  B2 is evaluated more than
   once.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2309
2310 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2311    Check if a text is encoded in SJIS.  If it is, return
2312    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2313
2314 static int
2315 detect_coding_sjis (src, src_end, multibytep)
2316      unsigned char *src, *src_end;
2317      int multibytep;
2318 {
2319   int c;
2320   /* Dummy for ONE_MORE_BYTE.  */
2321   struct coding_system dummy_coding;
2322   struct coding_system *coding = &dummy_coding;
2323
2324   while (1)
2325     {
2326       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2327       if (c >= 0x81)
2328         {
2329           if (c <= 0x9F || (c >= 0xE0 && c <= 0xEF))
2330             {
2331               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2332               if (c < 0x40 || c == 0x7F || c > 0xFC)
2333                 return 0;
2334             }
2335           else if (c > 0xDF)
2336             return 0;
2337         }
2338     }
2339  label_end_of_loop:
2340   return CODING_CATEGORY_MASK_SJIS;
2341 }
2342
2343 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2344    Check if a text is encoded in BIG5.  If it is, return
2345    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2346
2347 static int
2348 detect_coding_big5 (src, src_end, multibytep)
2349      unsigned char *src, *src_end;
2350      int multibytep;
2351 {
2352   int c;
2353   /* Dummy for ONE_MORE_BYTE.  */
2354   struct coding_system dummy_coding;
2355   struct coding_system *coding = &dummy_coding;
2356
2357   while (1)
2358     {
2359       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2360       if (c >= 0xA1)
2361         {
2362           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2363           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2364             return 0;
2365         }
2366     }
2367  label_end_of_loop:
2368   return CODING_CATEGORY_MASK_BIG5;
2369 }
2370
2371 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2372    Check if a text is encoded in UTF-8.  If it is, return
2373    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2374
2375 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2376 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2377 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2378 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2379 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2380 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2381 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2382
2383 static int
2384 detect_coding_utf_8 (src, src_end, multibytep)
2385      unsigned char *src, *src_end;
2386      int multibytep;
2387 {
2388   unsigned char c;
2389   int seq_maybe_bytes;
2390   /* Dummy for ONE_MORE_BYTE.  */
2391   struct coding_system dummy_coding;
2392   struct coding_system *coding = &dummy_coding;
2393
2394   while (1)
2395     {
2396       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2397       if (UTF_8_1_OCTET_P (c))
2398         continue;
2399       else if (UTF_8_2_OCTET_LEADING_P (c))
2400         seq_maybe_bytes = 1;
2401       else if (UTF_8_3_OCTET_LEADING_P (c))
2402         seq_maybe_bytes = 2;
2403       else if (UTF_8_4_OCTET_LEADING_P (c))
2404         seq_maybe_bytes = 3;
2405       else if (UTF_8_5_OCTET_LEADING_P (c))
2406         seq_maybe_bytes = 4;
2407       else if (UTF_8_6_OCTET_LEADING_P (c))
2408         seq_maybe_bytes = 5;
2409       else
2410         return 0;
2411
2412       do
2413         {
2414           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2415           if (!UTF_8_EXTRA_OCTET_P (c))
2416             return 0;
2417           seq_maybe_bytes--;
2418         }
2419       while (seq_maybe_bytes > 0);
2420     }
2421
2422  label_end_of_loop:
2423   return CODING_CATEGORY_MASK_UTF_8;
2424 }
2425
2426 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2427    Check if a text is encoded in UTF-16 by looking at its first two
2428    bytes.  If they are FF FE, return CODING_CATEGORY_MASK_UTF_16_LE;
2429    if they are FE FF, return CODING_CATEGORY_MASK_UTF_16_BE;
2430    else return 0.  */
2431
/* Nonzero if VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit code unit VAL is a high surrogate, i.e. is
   in the range 0xD800..0xDBFF.  The six high bits select the range,
   hence the mask 0xFC00; a mask of 0xD800 would also accept low
   surrogates and many ordinary code units such as 0xFFFD.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low surrogate, i.e. is
   in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2441
2442 static int
2443 detect_coding_utf_16 (src, src_end, multibytep)
2444      unsigned char *src, *src_end;
2445      int multibytep;
2446 {
2447   unsigned char c1, c2;
2448   /* Dummy for TWO_MORE_BYTES.  */
2449   struct coding_system dummy_coding;
2450   struct coding_system *coding = &dummy_coding;
2451
2452   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2453   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2454
2455   if ((c1 == 0xFF) && (c2 == 0xFE))
2456     return CODING_CATEGORY_MASK_UTF_16_LE;
2457   else if ((c1 == 0xFE) && (c2 == 0xFF))
2458     return CODING_CATEGORY_MASK_UTF_16_BE;
2459
2460  label_end_of_loop:
2461   return 0;
2462 }
2463
2464 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2465    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.

     Decode the SRC_BYTES bytes at SOURCE and store the result at
     DESTINATION, an area of DST_BYTES bytes.  Bytes below 0x80 are
     handled as ASCII, with end-of-line conversion according to
     CODING->eol_type.  A byte that does not start a valid code is
     counted in CODING->errors and passed to EMIT_CHAR by itself.

     On return, CODING->consumed and CODING->consumed_char are the
     number of source bytes processed (up to the start of the last,
     unfinished loop) and CODING->produced is the number of bytes
     stored.  CODING->result is set to CODING_FINISH_INCONSISTENT_EOL
     when CODING_MODE_INHIBIT_INCONSISTENT_EOL is set and an
     end-of-line not matching CODING->eol_type is met; the macros used
     in the loop may set other results.  */
2466
2467 static void
2468 decode_coding_sjis_big5 (coding, source, destination,
2469                          src_bytes, dst_bytes, sjis_p)
2470      struct coding_system *coding;
2471      unsigned char *source, *destination;
2472      int src_bytes, dst_bytes;
2473      int sjis_p;
2474 {
2475   unsigned char *src = source;
2476   unsigned char *src_end = source + src_bytes;
2477   unsigned char *dst = destination;
2478   unsigned char *dst_end = destination + dst_bytes;
2479   /* SRC_BASE remembers the start position in source in each loop.
2480      The loop will be exited when there's not enough source code
2481      (within macro ONE_MORE_BYTE), or when there's not enough
2482      destination area to produce a character (within macro
2483      EMIT_CHAR).  */
2484   unsigned char *src_base;
2485   Lisp_Object translation_table;
2486
       /* Choose the translation table: none when character translation
          is disabled, else the one of CODING, else the standard one.  */
2487   if (NILP (Venable_character_translation))
2488     translation_table = Qnil;
2489   else
2490     {
2491       translation_table = coding->translation_table_for_decode;
2492       if (NILP (translation_table))
2493         translation_table = Vstandard_translation_table_for_decode;
2494     }
2495
2496   coding->produced_char = 0;
2497   while (1)
2498     {
           /* C2 is set only for two-byte codes and for the byte read
              after '\r'.
              NOTE(review): presumably DECODE_ISO_CHARACTER ignores C2
              for one-byte charsets -- confirm at its definition.  */
2499       int c, charset, c1, c2;
2500
2501       src_base = src;
2502       ONE_MORE_BYTE (c1);
2503
2504       if (c1 < 0x80)
2505         {
2506           charset = CHARSET_ASCII;
2507           if (c1 < 0x20)
2508             {
2509               if (c1 == '\r')
2510                 {
2511                   if (coding->eol_type == CODING_EOL_CRLF)
2512                     {
                           /* CR LF is decoded to '\n'.  A CR followed
                              by another byte is either inconsistent
                              or is kept as '\r'.  */
2513                       ONE_MORE_BYTE (c2);
2514                       if (c2 == '\n')
2515                         c1 = c2;
2516                       else if (coding->mode
2517                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2518                         {
2519                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2520                           goto label_end_of_loop;
2521                         }
2522                       else
2523                         /* To process C2 again, SRC is subtracted by 1.  */
2524                         src--;
2525                     }
2526                   else if (coding->eol_type == CODING_EOL_CR)
2527                     c1 = '\n';
2528                 }
2529               else if (c1 == '\n'
2530                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2531                        && (coding->eol_type == CODING_EOL_CR
2532                            || coding->eol_type == CODING_EOL_CRLF))
2533                 {
                       /* A bare LF where the end-of-line format is CR
                          or CRLF.  */
2534                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2535                   goto label_end_of_loop;
2536                 }
2537             }
2538         }
2539       else
2540         {
2541           if (sjis_p)
2542             {
2543               if (c1 >= 0xF0)
2544                 goto label_invalid_code;
                   /* Here C1 is in 0x80..0xEF.  0xA0..0xDF is a
                      one-byte code; the rest starts a two-byte code.  */
2545               if (c1 < 0xA0 || c1 >= 0xE0)
2546                 {
2547                   /* SJIS -> JISX0208 */
2548                   ONE_MORE_BYTE (c2);
2549                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2550                     goto label_invalid_code;
2551                   DECODE_SJIS (c1, c2, c1, c2);
2552                   charset = charset_jisx0208;
2553                 }
2554               else
2555                 /* SJIS -> JISX0201-Kana */
2556                 charset = charset_katakana_jisx0201;
2557             }
2558           else
2559             {
2560               /* BIG5 -> Big5 */
2561               if (c1 < 0xA1 || c1 > 0xFE)
2562                 goto label_invalid_code;
2563               ONE_MORE_BYTE (c2);
2564               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2565                 goto label_invalid_code;
2566               DECODE_BIG5 (c1, c2, charset, c1, c2);
2567             }
2568         }
2569
2570       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2571       EMIT_CHAR (c);
2572       continue;
2573
           /* Go back to the first byte of the invalid code, emit that
              byte alone, and resume the loop with the byte after it.  */
2574     label_invalid_code:
2575       coding->errors++;
2576       src = src_base;
2577       c = *src++;
2578       EMIT_CHAR (c);
2579     }
2580
       /* SRC_BASE, not SRC, gives the consumed amount, so that a partly
          read code is not counted.  */
2581  label_end_of_loop:
2582   coding->consumed = coding->consumed_char = src_base - source;
2583   coding->produced = dst - destination;
2584   return;
2585 }
2586
2587 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2588    This function can encode charsets `ascii', `katakana-jisx0201',
2589    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2590    are sure that all these charsets are registered as official charset
2591    (i.e. do not have extended leading-codes).  Characters of other
2592    charsets are produced without any encoding.  If SJIS_P is 1, encode
2593    SJIS text, else encode BIG5 text.  */
2594
2595 static void
2596 encode_coding_sjis_big5 (coding, source, destination,
2597                          src_bytes, dst_bytes, sjis_p)
2598      struct coding_system *coding;
2599      unsigned char *source, *destination;
2600      int src_bytes, dst_bytes;
2601      int sjis_p;
2602 {
2603   unsigned char *src = source;
2604   unsigned char *src_end = source + src_bytes;
2605   unsigned char *dst = destination;
2606   unsigned char *dst_end = destination + dst_bytes;
2607   /* SRC_BASE remembers the start position in source in each loop.
2608      The loop will be exited when there's not enough source text to
2609      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2610      there's not enough destination area to produce encoded codes
2611      (within macro EMIT_BYTES).  */
2612   unsigned char *src_base;
2613   Lisp_Object translation_table;
2614
2615   if (NILP (Venable_character_translation))
2616     translation_table = Qnil;
2617   else
2618     {
2619       translation_table = coding->translation_table_for_encode;
2620       if (NILP (translation_table))
2621         translation_table = Vstandard_translation_table_for_encode;
2622     }
2623
2624   while (1)
2625     {
2626       int c, charset, c1, c2;
2627
2628       src_base = src;
2629       ONE_MORE_CHAR (c);
2630
2631       /* Now encode the character C.  */
2632       if (SINGLE_BYTE_CHAR_P (c))
2633         {
2634           switch (c)
2635             {
2636             case '\r':
2637               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2638                 {
2639                   EMIT_ONE_BYTE (c);
2640                   break;
2641                 }
2642               c = '\n';
2643             case '\n':
2644               if (coding->eol_type == CODING_EOL_CRLF)
2645                 {
2646                   EMIT_TWO_BYTES ('\r', c);
2647                   break;
2648                 }
2649               else if (coding->eol_type == CODING_EOL_CR)
2650                 c = '\r';
2651             default:
2652               EMIT_ONE_BYTE (c);
2653             }
2654         }
2655       else
2656         {
2657           SPLIT_CHAR (c, charset, c1, c2);
2658           if (sjis_p)
2659             {
2660               if (charset == charset_jisx0208
2661                   || charset == charset_jisx0208_1978)
2662                 {
2663                   ENCODE_SJIS (c1, c2, c1, c2);
2664                   EMIT_TWO_BYTES (c1, c2);
2665                 }
2666               else if (charset == charset_katakana_jisx0201)
2667                 EMIT_ONE_BYTE (c1 | 0x80);
2668               else if (charset == charset_latin_jisx0201)
2669                 EMIT_ONE_BYTE (c1);
2670               else
2671                 /* There's no way other than producing the internal
2672                    codes as is.  */
2673                 EMIT_BYTES (src_base, src);
2674             }
2675           else
2676             {
2677               if (charset == charset_big5_1 || charset == charset_big5_2)
2678                 {
2679                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2680                   EMIT_TWO_BYTES (c1, c2);
2681                 }
2682               else
2683                 /* There's no way other than producing the internal
2684                    codes as is.  */
2685                 EMIT_BYTES (src_base, src);
2686             }
2687         }
2688       coding->consumed_char++;
2689     }
2690
2691  label_end_of_loop:
2692   coding->consumed = src_base - source;
2693   coding->produced = coding->produced_char = dst - destination;
2694 }
2695
2696 \f
2697 /*** 5. CCL handlers ***/
2698
2699 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2700    Check if a text is encoded in a coding system of which
2701    encoder/decoder are written in CCL program.  If it is, return
2702    CODING_CATEGORY_MASK_CCL, else return 0.  */
2703
2704 static int
2705 detect_coding_ccl (src, src_end, multibytep)
2706      unsigned char *src, *src_end;
2707      int multibytep;
2708 {
2709   unsigned char *valid;
2710   int c;
2711   /* Dummy for ONE_MORE_BYTE.  */
2712   struct coding_system dummy_coding;
2713   struct coding_system *coding = &dummy_coding;
2714
2715   /* No coding system is assigned to coding-category-ccl.  */
2716   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2717     return 0;
2718
2719   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2720   while (1)
2721     {
2722       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2723       if (! valid[c])
2724         return 0;
2725     }
2726  label_end_of_loop:
2727   return CODING_CATEGORY_MASK_CCL;
2728 }
2729
2730 \f
2731 /*** 6. End-of-line handlers ***/
2732
2733 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2734
2735 static void
2736 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2737      struct coding_system *coding;
2738      unsigned char *source, *destination;
2739      int src_bytes, dst_bytes;
2740 {
2741   unsigned char *src = source;
2742   unsigned char *dst = destination;
2743   unsigned char *src_end = src + src_bytes;
2744   unsigned char *dst_end = dst + dst_bytes;
2745   Lisp_Object translation_table;
2746   /* SRC_BASE remembers the start position in source in each loop.
2747      The loop will be exited when there's not enough source code
2748      (within macro ONE_MORE_BYTE), or when there's not enough
2749      destination area to produce a character (within macro
2750      EMIT_CHAR).  */
2751   unsigned char *src_base;
2752   int c;
2753
2754   translation_table = Qnil;
2755   switch (coding->eol_type)
2756     {
2757     case CODING_EOL_CRLF:
2758       while (1)
2759         {
2760           src_base = src;
2761           ONE_MORE_BYTE (c);
2762           if (c == '\r')
2763             {
2764               ONE_MORE_BYTE (c);
2765               if (c != '\n')
2766                 {
2767                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2768                     {
2769                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2770                       goto label_end_of_loop;
2771                     }
2772                   src--;
2773                   c = '\r';
2774                 }
2775             }
2776           else if (c == '\n'
2777                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2778             {
2779               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2780               goto label_end_of_loop;
2781             }
2782           EMIT_CHAR (c);
2783         }
2784       break;
2785
2786     case CODING_EOL_CR:
2787       while (1)
2788         {
2789           src_base = src;
2790           ONE_MORE_BYTE (c);
2791           if (c == '\n')
2792             {
2793               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2794                 {
2795                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2796                   goto label_end_of_loop;
2797                 }
2798             }
2799           else if (c == '\r')
2800             c = '\n';
2801           EMIT_CHAR (c);
2802         }
2803       break;
2804
2805     default:                    /* no need for EOL handling */
2806       while (1)
2807         {
2808           src_base = src;
2809           ONE_MORE_BYTE (c);
2810           EMIT_CHAR (c);
2811         }
2812     }
2813
2814  label_end_of_loop:
2815   coding->consumed = coding->consumed_char = src_base - source;
2816   coding->produced = dst - destination;
2817   return;
2818 }
2819
2820 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2821    format of end-of-line according to `coding->eol_type'.  It also
2822    converts multibyte form 8-bit characters to unibyte if
2823    CODING->src_multibyte is nonzero.  If `coding->mode &
2824    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2825    also means end-of-line.  */
2826
2827 static void
2828 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2829      struct coding_system *coding;
2830      unsigned char *source, *destination;
2831      int src_bytes, dst_bytes;
2832 {
2833   unsigned char *src = source;
2834   unsigned char *dst = destination;
2835   unsigned char *src_end = src + src_bytes;
2836   unsigned char *dst_end = dst + dst_bytes;
2837   Lisp_Object translation_table;
2838   /* SRC_BASE remembers the start position in source in each loop.
2839      The loop will be exited when there's not enough source text to
2840      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2841      there's not enough destination area to produce encoded codes
2842      (within macro EMIT_BYTES).  */
2843   unsigned char *src_base;
2844   int c;
2845   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2846
2847   translation_table = Qnil;
2848   if (coding->src_multibyte
2849       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2850     {
2851       src_end--;
2852       src_bytes--;
2853       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2854     }
2855
2856   if (coding->eol_type == CODING_EOL_CRLF)
2857     {
2858       while (src < src_end)
2859         {
2860           src_base = src;
2861           c = *src++;
2862           if (c >= 0x20)
2863             EMIT_ONE_BYTE (c);
2864           else if (c == '\n' || (c == '\r' && selective_display))
2865             EMIT_TWO_BYTES ('\r', '\n');
2866           else
2867             EMIT_ONE_BYTE (c);
2868         }
2869       src_base = src;
2870     label_end_of_loop:
2871       ;
2872     }
2873   else
2874     {
2875       if (!dst_bytes || src_bytes <= dst_bytes)
2876         {
2877           safe_bcopy (src, dst, src_bytes);
2878           src_base = src_end;
2879           dst += src_bytes;
2880         }
2881       else
2882         {
2883           if (coding->src_multibyte
2884               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2885             dst_bytes--;
2886           safe_bcopy (src, dst, dst_bytes);
2887           src_base = src + dst_bytes;
2888           dst = destination + dst_bytes;
2889           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2890         }
2891       if (coding->eol_type == CODING_EOL_CR)
2892         {
2893           for (src = destination; src < dst; src++)
2894             if (*src == '\n') *src = '\r';
2895         }
2896       else if (selective_display)
2897         {
2898           for (src = destination; src < dst; src++)
2899             if (*src == '\r') *src = '\n';
2900         }
2901     }
2902   if (coding->src_multibyte)
2903     dst = destination + str_as_unibyte (destination, dst - destination);
2904
2905   coding->consumed = src_base - source;
2906   coding->produced = dst - destination;
2907   coding->produced_char = coding->produced;
2908 }
2909
2910 \f
2911 /*** 7. C library functions ***/
2912
2913 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2914    has a property `coding-system'.  The value of this property is a
2915    vector of length 5 (called as coding-vector).  Among elements of
2916    this vector, the first (element[0]) and the fifth (element[4])
2917    carry important information for decoding/encoding.  Before
2918    decoding/encoding, this information should be set in fields of a
2919    structure of type `coding_system'.
2920
2921    A value of property `coding-system' can be a symbol of another
2922    subsidiary coding-system.  In that case, Emacs gets coding-vector
2923    from that symbol.
2924
2925    `element[0]' contains information to be set in `coding->type'.  The
2926    value and its meaning is as follows:
2927
2928    0 -- coding_type_emacs_mule
2929    1 -- coding_type_sjis
2930    2 -- coding_type_iso2022
2931    3 -- coding_type_big5
   4 -- coding_type_ccl encoder/decoder written in CCL
   5 -- coding_type_raw_text
2933    nil -- coding_type_no_conversion
2934    t -- coding_type_undecided (automatic conversion on decoding,
2935                                no-conversion on encoding)
2936
2937    `element[4]' contains information to be set in `coding->flags' and
2938    `coding->spec'.  The meaning varies by `coding->type'.
2939
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
   Meanings of these sub-elements are:
2943
2944    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2945         If the value is an integer of valid charset, the charset is
2946         assumed to be designated to graphic register N initially.
2947
2948         If the value is minus, it is a minus value of charset which
2949         reserves graphic register N, which means that the charset is
2950         not designated initially but should be designated to graphic
2951         register N just before encoding a character in that charset.
2952
2953         If the value is nil, graphic register N is never used on
2954         encoding.
2955
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
        Each value takes t or nil.  See the section ISO2022 of
        `coding.h' for more information.
2959
2960    If `coding->type' is `coding_type_big5', element[4] is t to denote
2961    BIG5-ETen or nil to denote BIG5-HKU.
2962
2963    If `coding->type' takes the other value, element[4] is ignored.
2964
2965    Emacs Lisp's coding system also carries information about format of
2966    end-of-line in a value of property `eol-type'.  If the value is
2967    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2968    means CODING_EOL_CR.  If it is not integer, it should be a vector
2969    of subsidiary coding systems of which property `eol-type' has one
2970    of above values.
2971
2972 */
2973
2974 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2975    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2976    is setup so that no conversion is necessary and return -1, else
2977    return 0.  */
2978
2979 int
2980 setup_coding_system (coding_system, coding)
2981      Lisp_Object coding_system;
2982      struct coding_system *coding;
2983 {
2984   Lisp_Object coding_spec, coding_type, eol_type, plist;
2985   Lisp_Object val;
2986   int i;
2987
2988   /* At first, zero clear all members.  */
2989   bzero (coding, sizeof (struct coding_system));
2990
2991   /* Initialize some fields required for all kinds of coding systems.  */
2992   coding->symbol = coding_system;
2993   coding->heading_ascii = -1;
2994   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2995   coding->composing = COMPOSITION_DISABLED;
2996   coding->cmp_data = NULL;
2997
2998   if (NILP (coding_system))
2999     goto label_invalid_coding_system;
3000
3001   coding_spec = Fget (coding_system, Qcoding_system);
3002
3003   if (!VECTORP (coding_spec)
3004       || XVECTOR (coding_spec)->size != 5
3005       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3006     goto label_invalid_coding_system;
3007
3008   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3009   if (VECTORP (eol_type))
3010     {
3011       coding->eol_type = CODING_EOL_UNDECIDED;
3012       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3013     }
3014   else if (XFASTINT (eol_type) == 1)
3015     {
3016       coding->eol_type = CODING_EOL_CRLF;
3017       coding->common_flags
3018         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3019     }
3020   else if (XFASTINT (eol_type) == 2)
3021     {
3022       coding->eol_type = CODING_EOL_CR;
3023       coding->common_flags
3024         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3025     }
3026   else
3027     coding->eol_type = CODING_EOL_LF;
3028
3029   coding_type = XVECTOR (coding_spec)->contents[0];
3030   /* Try short cut.  */
3031   if (SYMBOLP (coding_type))
3032     {
3033       if (EQ (coding_type, Qt))
3034         {
3035           coding->type = coding_type_undecided;
3036           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3037         }
3038       else
3039         coding->type = coding_type_no_conversion;
3040       /* Initialize this member.  Any thing other than
3041          CODING_CATEGORY_IDX_UTF_16_BE and
3042          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3043          special treatment in detect_eol.  */
3044       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3045
3046       return 0;
3047     }
3048
3049   /* Get values of coding system properties:
3050      `post-read-conversion', `pre-write-conversion',
3051      `translation-table-for-decode', `translation-table-for-encode'.  */
3052   plist = XVECTOR (coding_spec)->contents[3];
3053   /* Pre & post conversion functions should be disabled if
3054      inhibit_eol_conversion is nozero.  This is the case that a code
3055      conversion function is called while those functions are running.  */
3056   if (! inhibit_pre_post_conversion)
3057     {
3058       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3059       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3060     }
3061   val = Fplist_get (plist, Qtranslation_table_for_decode);
3062   if (SYMBOLP (val))
3063     val = Fget (val, Qtranslation_table_for_decode);
3064   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3065   val = Fplist_get (plist, Qtranslation_table_for_encode);
3066   if (SYMBOLP (val))
3067     val = Fget (val, Qtranslation_table_for_encode);
3068   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3069   val = Fplist_get (plist, Qcoding_category);
3070   if (!NILP (val))
3071     {
3072       val = Fget (val, Qcoding_category_index);
3073       if (INTEGERP (val))
3074         coding->category_idx = XINT (val);
3075       else
3076         goto label_invalid_coding_system;
3077     }
3078   else
3079     goto label_invalid_coding_system;
3080
3081   /* If the coding system has non-nil `composition' property, enable
3082      composition handling.  */
3083   val = Fplist_get (plist, Qcomposition);
3084   if (!NILP (val))
3085     coding->composing = COMPOSITION_NO;
3086
3087   switch (XFASTINT (coding_type))
3088     {
3089     case 0:
3090       coding->type = coding_type_emacs_mule;
3091       if (!NILP (coding->post_read_conversion))
3092         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3093       if (!NILP (coding->pre_write_conversion))
3094         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3095       break;
3096
3097     case 1:
3098       coding->type = coding_type_sjis;
3099       coding->common_flags
3100         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3101       break;
3102
3103     case 2:
3104       coding->type = coding_type_iso2022;
3105       coding->common_flags
3106         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3107       {
3108         Lisp_Object val, temp;
3109         Lisp_Object *flags;
3110         int i, charset, reg_bits = 0;
3111
3112         val = XVECTOR (coding_spec)->contents[4];
3113
3114         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3115           goto label_invalid_coding_system;
3116
3117         flags = XVECTOR (val)->contents;
3118         coding->flags
3119           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3120              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3121              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3122              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3123              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3124              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3125              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3126              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3127              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3128              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3129              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3130              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3131              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3132              );
3133
3134         /* Invoke graphic register 0 to plane 0.  */
3135         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3136         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3137         CODING_SPEC_ISO_INVOCATION (coding, 1)
3138           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3139         /* Not single shifting at first.  */
3140         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3141         /* Beginning of buffer should also be regarded as bol. */
3142         CODING_SPEC_ISO_BOL (coding) = 1;
3143
3144         for (charset = 0; charset <= MAX_CHARSET; charset++)
3145           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3146         val = Vcharset_revision_alist;
3147         while (CONSP (val))
3148           {
3149             charset = get_charset_id (Fcar_safe (XCAR (val)));
3150             if (charset >= 0
3151                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3152                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3153               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3154             val = XCDR (val);
3155           }
3156
3157         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3158            FLAGS[REG] can be one of below:
3159                 integer CHARSET: CHARSET occupies register I,
3160                 t: designate nothing to REG initially, but can be used
3161                   by any charsets,
3162                 list of integer, nil, or t: designate the first
3163                   element (if integer) to REG initially, the remaining
3164                   elements (if integer) is designated to REG on request,
3165                   if an element is t, REG can be used by any charsets,
3166                 nil: REG is never used.  */
3167         for (charset = 0; charset <= MAX_CHARSET; charset++)
3168           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3169             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3170         for (i = 0; i < 4; i++)
3171           {
3172             if (INTEGERP (flags[i])
3173                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3174                 || (charset = get_charset_id (flags[i])) >= 0)
3175               {
3176                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3177                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3178               }
3179             else if (EQ (flags[i], Qt))
3180               {
3181                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3182                 reg_bits |= 1 << i;
3183                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3184               }
3185             else if (CONSP (flags[i]))
3186               {
3187                 Lisp_Object tail;
3188                 tail = flags[i];
3189
3190                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3191                 if (INTEGERP (XCAR (tail))
3192                     && (charset = XINT (XCAR (tail)),
3193                         CHARSET_VALID_P (charset))
3194                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3195                   {
3196                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3197                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3198                   }
3199                 else
3200                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3201                 tail = XCDR (tail);
3202                 while (CONSP (tail))
3203                   {
3204                     if (INTEGERP (XCAR (tail))
3205                         && (charset = XINT (XCAR (tail)),
3206                             CHARSET_VALID_P (charset))
3207                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3208                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3209                         = i;
3210                     else if (EQ (XCAR (tail), Qt))
3211                       reg_bits |= 1 << i;
3212                     tail = XCDR (tail);
3213                   }
3214               }
3215             else
3216               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3217
3218             CODING_SPEC_ISO_DESIGNATION (coding, i)
3219               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3220           }
3221
3222         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3223           {
3224             /* REG 1 can be used only by locking shift in 7-bit env.  */
3225             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3226               reg_bits &= ~2;
3227             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3228               /* Without any shifting, only REG 0 and 1 can be used.  */
3229               reg_bits &= 3;
3230           }
3231
3232         if (reg_bits)
3233           for (charset = 0; charset <= MAX_CHARSET; charset++)
3234             {
3235               if (CHARSET_VALID_P (charset)
3236                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3237                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3238                 {
3239                   /* There exist some default graphic registers to be
3240                      used by CHARSET.  */
3241
3242                   /* We had better avoid designating a charset of
3243                      CHARS96 to REG 0 as far as possible.  */
3244                   if (CHARSET_CHARS (charset) == 96)
3245                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3246                       = (reg_bits & 2
3247                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3248                   else
3249                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3250                       = (reg_bits & 1
3251                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3252                 }
3253             }
3254       }
3255       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3256       coding->spec.iso2022.last_invalid_designation_register = -1;
3257       break;
3258
3259     case 3:
3260       coding->type = coding_type_big5;
3261       coding->common_flags
3262         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3263       coding->flags
3264         = (NILP (XVECTOR (coding_spec)->contents[4])
3265            ? CODING_FLAG_BIG5_HKU
3266            : CODING_FLAG_BIG5_ETEN);
3267       break;
3268
3269     case 4:
3270       coding->type = coding_type_ccl;
3271       coding->common_flags
3272         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3273       {
3274         val = XVECTOR (coding_spec)->contents[4];
3275         if (! CONSP (val)
3276             || setup_ccl_program (&(coding->spec.ccl.decoder),
3277                                   XCAR (val)) < 0
3278             || setup_ccl_program (&(coding->spec.ccl.encoder),
3279                                   XCDR (val)) < 0)
3280           goto label_invalid_coding_system;
3281
3282         bzero (coding->spec.ccl.valid_codes, 256);
3283         val = Fplist_get (plist, Qvalid_codes);
3284         if (CONSP (val))
3285           {
3286             Lisp_Object this;
3287
3288             for (; CONSP (val); val = XCDR (val))
3289               {
3290                 this = XCAR (val);
3291                 if (INTEGERP (this)
3292                     && XINT (this) >= 0 && XINT (this) < 256)
3293                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3294                 else if (CONSP (this)
3295                          && INTEGERP (XCAR (this))
3296                          && INTEGERP (XCDR (this)))
3297                   {
3298                     int start = XINT (XCAR (this));
3299                     int end = XINT (XCDR (this));
3300
3301                     if (start >= 0 && start <= end && end < 256)
3302                       while (start <= end)
3303                         coding->spec.ccl.valid_codes[start++] = 1;
3304                   }
3305               }
3306           }
3307       }
3308       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3309       coding->spec.ccl.cr_carryover = 0;
3310       break;
3311
3312     case 5:
3313       coding->type = coding_type_raw_text;
3314       break;
3315
3316     default:
3317       goto label_invalid_coding_system;
3318     }
3319   return 0;
3320
3321  label_invalid_coding_system:
3322   coding->type = coding_type_no_conversion;
3323   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3324   coding->common_flags = 0;
3325   coding->eol_type = CODING_EOL_LF;
3326   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3327   return -1;
3328 }
3329
3330 /* Free memory blocks allocated for storing composition information.  */
3331
3332 void
3333 coding_free_composition_data (coding)
3334      struct coding_system *coding;
3335 {
3336   struct composition_data *cmp_data = coding->cmp_data, *next;
3337
3338   if (!cmp_data)
3339     return;
3340   /* Memory blocks are chained.  At first, rewind to the first, then,
3341      free blocks one by one.  */
3342   while (cmp_data->prev)
3343     cmp_data = cmp_data->prev;
3344   while (cmp_data)
3345     {
3346       next = cmp_data->next;
3347       xfree (cmp_data);
3348       cmp_data = next;
3349     }
3350   coding->cmp_data = NULL;
3351 }
3352
3353 /* Set `char_offset' member of all memory blocks pointed by
3354    coding->cmp_data to POS.  */
3355
3356 void
3357 coding_adjust_composition_offset (coding, pos)
3358      struct coding_system *coding;
3359      int pos;
3360 {
3361   struct composition_data *cmp_data;
3362
3363   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3364     cmp_data->char_offset = pos;
3365 }
3366
3367 /* Setup raw-text or one of its subsidiaries in the structure
3368    coding_system CODING according to the already setup value eol_type
3369    in CODING.  CODING should be setup for some coding system in
3370    advance.  */
3371
3372 void
3373 setup_raw_text_coding_system (coding)
3374      struct coding_system *coding;
3375 {
3376   if (coding->type != coding_type_raw_text)
3377     {
3378       coding->symbol = Qraw_text;
3379       coding->type = coding_type_raw_text;
3380       if (coding->eol_type != CODING_EOL_UNDECIDED)
3381         {
3382           Lisp_Object subsidiaries;
3383           subsidiaries = Fget (Qraw_text, Qeol_type);
3384
3385           if (VECTORP (subsidiaries)
3386               && XVECTOR (subsidiaries)->size == 3)
3387             coding->symbol
3388               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3389         }
3390       setup_coding_system (coding->symbol, coding);
3391     }
3392   return;
3393 }
3394
3395 /* Emacs has a mechanism to automatically detect a coding system if it
3396    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3397    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the categories listed below:
3400
3401    o coding-category-emacs-mule
3402
3403         The category for a coding system which has the same code range
3404         as Emacs' internal format.  Assigned the coding-system (Lisp
3405         symbol) `emacs-mule' by default.
3406
3407    o coding-category-sjis
3408
3409         The category for a coding system which has the same code range
3410         as SJIS.  Assigned the coding-system (Lisp
3411         symbol) `japanese-shift-jis' by default.
3412
3413    o coding-category-iso-7
3414
3415         The category for a coding system which has the same code range
3416         as ISO2022 of 7-bit environment.  This doesn't use any locking
3417         shift and single shift functions.  This can encode/decode all
3418         charsets.  Assigned the coding-system (Lisp symbol)
3419         `iso-2022-7bit' by default.
3420
3421    o coding-category-iso-7-tight
3422
3423         Same as coding-category-iso-7 except that this can
3424         encode/decode only the specified charsets.
3425
3426    o coding-category-iso-8-1
3427
3428         The category for a coding system which has the same code range
3429         as ISO2022 of 8-bit environment and graphic plane 1 used only
3430         for DIMENSION1 charset.  This doesn't use any locking shift
3431         and single shift functions.  Assigned the coding-system (Lisp
3432         symbol) `iso-latin-1' by default.
3433
3434    o coding-category-iso-8-2
3435
3436         The category for a coding system which has the same code range
3437         as ISO2022 of 8-bit environment and graphic plane 1 used only
3438         for DIMENSION2 charset.  This doesn't use any locking shift
3439         and single shift functions.  Assigned the coding-system (Lisp
3440         symbol) `japanese-iso-8bit' by default.
3441
3442    o coding-category-iso-7-else
3443
3444         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
3446         single shift functions.  Assigned the coding-system (Lisp
3447         symbol) `iso-2022-7bit-lock' by default.
3448
3449    o coding-category-iso-8-else
3450
3451         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
3453         single shift functions.  Assigned the coding-system (Lisp
3454         symbol) `iso-2022-8bit-ss2' by default.
3455
3456    o coding-category-big5
3457
3458         The category for a coding system which has the same code range
3459         as BIG5.  Assigned the coding-system (Lisp symbol)
3460         `cn-big5' by default.
3461
3462    o coding-category-utf-8
3463
3464         The category for a coding system which has the same code range
3465         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3466         symbol) `utf-8' by default.
3467
3468    o coding-category-utf-16-be
3469
3470         The category for a coding system in which a text has an
3471         Unicode signature (cf. Unicode Standard) in the order of BIG
3472         endian at the head.  Assigned the coding-system (Lisp symbol)
3473         `utf-16-be' by default.
3474
3475    o coding-category-utf-16-le
3476
3477         The category for a coding system in which a text has an
3478         Unicode signature (cf. Unicode Standard) in the order of
3479         LITTLE endian at the head.  Assigned the coding-system (Lisp
3480         symbol) `utf-16-le' by default.
3481
3482    o coding-category-ccl
3483
3484         The category for a coding system of which encoder/decoder is
3485         written in CCL programs.  The default value is nil, i.e., no
3486         coding system is assigned.
3487
3488    o coding-category-binary
3489
3490         The category for a coding system not categorized in any of the
3491         above.  Assigned the coding-system (Lisp symbol)
3492         `no-conversion' by default.
3493
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by a user.  What
   Emacs actually does is to detect a category of coding system, and
   then use the `coding-system' assigned to it.  If Emacs can't
   narrow the candidates down to a single category, it selects the
   category of the highest priority.  Priorities of categories are
   also specified by a user in the Lisp variable
   `coding-category-list'.
3501
3502 */
3503
/* Table indexed by a byte value.  `detect_coding_mask' skips the
   leading bytes of a text whose entry here is nonzero, and adjusts
   the entries for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC while it
   runs.  */
static int ascii_skip_code[256];
3506
3507 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3508    If it detects possible coding systems, return an integer in which
3509    appropriate flag bits are set.  Flag bits are defined by macros
3510    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3511    it should point the table `coding_priorities'.  In that case, only
3512    the flag bit for a coding system of the highest priority is set in
3513    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
3514    range 0x80..0x9F are in multibyte form.
3515
3516    How many ASCII characters are at the head is returned as *SKIP.  */
3517
3518 static int
3519 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
3520      unsigned char *source;
3521      int src_bytes, *priorities, *skip;
3522      int multibytep;
3523 {
3524   register unsigned char c;
3525   unsigned char *src = source, *src_end = source + src_bytes;
3526   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3527   int i, idx;
3528
3529   /* At first, skip all ASCII characters and control characters except
3530      for three ISO2022 specific control characters.  */
3531   ascii_skip_code[ISO_CODE_SO] = 0;
3532   ascii_skip_code[ISO_CODE_SI] = 0;
3533   ascii_skip_code[ISO_CODE_ESC] = 0;
3534
3535  label_loop_detect_coding:
3536   while (src < src_end && ascii_skip_code[*src]) src++;
3537   *skip = src - source;
3538
3539   if (src >= src_end)
3540     /* We found nothing other than ASCII.  There's nothing to do.  */
3541     return 0;
3542
3543   c = *src;
3544   /* The text seems to be encoded in some multilingual coding system.
3545      Now, try to find in which coding system the text is encoded.  */
3546   if (c < 0x80)
3547     {
3548       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3549       /* C is an ISO2022 specific control code of C0.  */
3550       mask = detect_coding_iso2022 (src, src_end, multibytep);
3551       if (mask == 0)
3552         {
3553           /* No valid ISO2022 code follows C.  Try again.  */
3554           src++;
3555           if (c == ISO_CODE_ESC)
3556             ascii_skip_code[ISO_CODE_ESC] = 1;
3557           else
3558             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3559           goto label_loop_detect_coding;
3560         }
3561       if (priorities)
3562         {
3563           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3564             {
3565               if (mask & priorities[i])
3566                 return priorities[i];
3567             }
3568           return CODING_CATEGORY_MASK_RAW_TEXT;
3569         }
3570     }
3571   else
3572     {
3573       int try;
3574
3575       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
3576         c = *src++ - 0x20;
3577
3578       if (c < 0xA0)
3579         {
3580           /* C is the first byte of SJIS character code,
3581              or a leading-code of Emacs' internal format (emacs-mule),
3582              or the first byte of UTF-16.  */
3583           try = (CODING_CATEGORY_MASK_SJIS
3584                   | CODING_CATEGORY_MASK_EMACS_MULE
3585                   | CODING_CATEGORY_MASK_UTF_16_BE
3586                   | CODING_CATEGORY_MASK_UTF_16_LE);
3587
3588           /* Or, if C is a special latin extra code,
3589              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3590              or is an ISO2022 control-sequence-introducer (CSI),
3591              we should also consider the possibility of ISO2022 codings.  */
3592           if ((VECTORP (Vlatin_extra_code_table)
3593                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3594               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3595               || (c == ISO_CODE_CSI
3596                   && (src < src_end
3597                       && (*src == ']'
3598                           || ((*src == '0' || *src == '1' || *src == '2')
3599                               && src + 1 < src_end
3600                               && src[1] == ']')))))
3601             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3602                      | CODING_CATEGORY_MASK_ISO_8BIT);
3603         }
3604       else
3605         /* C is a character of ISO2022 in graphic plane right,
3606            or a SJIS's 1-byte character code (i.e. JISX0201),
3607            or the first byte of BIG5's 2-byte code,
3608            or the first byte of UTF-8/16.  */
3609         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3610                 | CODING_CATEGORY_MASK_ISO_8BIT
3611                 | CODING_CATEGORY_MASK_SJIS
3612                 | CODING_CATEGORY_MASK_BIG5
3613                 | CODING_CATEGORY_MASK_UTF_8
3614                 | CODING_CATEGORY_MASK_UTF_16_BE
3615                 | CODING_CATEGORY_MASK_UTF_16_LE);
3616
3617       /* Or, we may have to consider the possibility of CCL.  */
3618       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3619           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3620               ->spec.ccl.valid_codes)[c])
3621         try |= CODING_CATEGORY_MASK_CCL;
3622
3623       mask = 0;
3624       utf16_examined_p = iso2022_examined_p = 0;
3625       if (priorities)
3626         {
3627           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3628             {
3629               if (!iso2022_examined_p
3630                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3631                 {
3632                   mask |= detect_coding_iso2022 (src, src_end);
3633                   iso2022_examined_p = 1;
3634                 }
3635               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3636                 mask |= detect_coding_sjis (src, src_end, multibytep);
3637               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3638                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
3639               else if (!utf16_examined_p
3640                        && (priorities[i] & try &
3641                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3642                 {
3643                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
3644                   utf16_examined_p = 1;
3645                 }
3646               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3647                 mask |= detect_coding_big5 (src, src_end, multibytep);
3648               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3649                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
3650               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3651                 mask |= detect_coding_ccl (src, src_end, multibytep);
3652               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3653                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3654               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3655                 mask |= CODING_CATEGORY_MASK_BINARY;
3656               if (mask & priorities[i])
3657                 return priorities[i];
3658             }
3659           return CODING_CATEGORY_MASK_RAW_TEXT;
3660         }
3661       if (try & CODING_CATEGORY_MASK_ISO)
3662         mask |= detect_coding_iso2022 (src, src_end, multibytep);
3663       if (try & CODING_CATEGORY_MASK_SJIS)
3664         mask |= detect_coding_sjis (src, src_end, multibytep);
3665       if (try & CODING_CATEGORY_MASK_BIG5)
3666         mask |= detect_coding_big5 (src, src_end, multibytep);
3667       if (try & CODING_CATEGORY_MASK_UTF_8)
3668         mask |= detect_coding_utf_8 (src, src_end, multibytep);
3669       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3670         mask |= detect_coding_utf_16 (src, src_end, multibytep);
3671       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3672         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
3673       if (try & CODING_CATEGORY_MASK_CCL)
3674         mask |= detect_coding_ccl (src, src_end, multibytep);
3675     }
3676   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3677 }
3678
3679 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3680    The information of the detected coding system is set in CODING.  */
3681
3682 void
3683 detect_coding (coding, src, src_bytes)
3684      struct coding_system *coding;
3685      unsigned char *src;
3686      int src_bytes;
3687 {
3688   unsigned int idx;
3689   int skip, mask, i;
3690   Lisp_Object val;
3691
3692   val = Vcoding_category_list;
3693   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
3694                              coding->src_multibyte);
3695   coding->heading_ascii = skip;
3696
3697   if (!mask) return;
3698
3699   /* We found a single coding system of the highest priority in MASK.  */
3700   idx = 0;
3701   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3702   if (! mask)
3703     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3704
3705   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3706
3707   if (coding->eol_type != CODING_EOL_UNDECIDED)
3708     {
3709       Lisp_Object tmp;
3710
3711       tmp = Fget (val, Qeol_type);
3712       if (VECTORP (tmp))
3713         val = XVECTOR (tmp)->contents[coding->eol_type];
3714     }
3715
3716   /* Setup this new coding system while preserving some slots.  */
3717   {
3718     int src_multibyte = coding->src_multibyte;
3719     int dst_multibyte = coding->dst_multibyte;
3720
3721     setup_coding_system (val, coding);
3722     coding->src_multibyte = src_multibyte;
3723     coding->dst_multibyte = dst_multibyte;
3724     coding->heading_ascii = skip;
3725   }
3726 }
3727
3728 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3729    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3730    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3731
3732    How many non-eol characters are at the head is returned as *SKIP.  */
3733
3734 #define MAX_EOL_CHECK_COUNT 3
3735
3736 static int
3737 detect_eol_type (source, src_bytes, skip)
3738      unsigned char *source;
3739      int src_bytes, *skip;
3740 {
3741   unsigned char *src = source, *src_end = src + src_bytes;
3742   unsigned char c;
3743   int total = 0;                /* How many end-of-lines are found so far.  */
3744   int eol_type = CODING_EOL_UNDECIDED;
3745   int this_eol_type;
3746
3747   *skip = 0;
3748
3749   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3750     {
3751       c = *src++;
3752       if (c == '\n' || c == '\r')
3753         {
3754           if (*skip == 0)
3755             *skip = src - 1 - source;
3756           total++;
3757           if (c == '\n')
3758             this_eol_type = CODING_EOL_LF;
3759           else if (src >= src_end || *src != '\n')
3760             this_eol_type = CODING_EOL_CR;
3761           else
3762             this_eol_type = CODING_EOL_CRLF, src++;
3763
3764           if (eol_type == CODING_EOL_UNDECIDED)
3765             /* This is the first end-of-line.  */
3766             eol_type = this_eol_type;
3767           else if (eol_type != this_eol_type)
3768             {
3769               /* The found type is different from what found before.  */
3770               eol_type = CODING_EOL_INCONSISTENT;
3771               break;
3772             }
3773         }
3774     }
3775
3776   if (*skip == 0)
3777     *skip = src_end - source;
3778   return eol_type;
3779 }
3780
3781 /* Like detect_eol_type, but detect EOL type in 2-octet
3782    big-endian/little-endian format for coding systems utf-16-be and
3783    utf-16-le.  */
3784
3785 static int
3786 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3787      unsigned char *source;
3788      int src_bytes, *skip;
3789 {
3790   unsigned char *src = source, *src_end = src + src_bytes;
3791   unsigned int c1, c2;
3792   int total = 0;                /* How many end-of-lines are found so far.  */
3793   int eol_type = CODING_EOL_UNDECIDED;
3794   int this_eol_type;
3795   int msb, lsb;
3796
3797   if (big_endian_p)
3798     msb = 0, lsb = 1;
3799   else
3800     msb = 1, lsb = 0;
3801
3802   *skip = 0;
3803
3804   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3805     {
3806       c1 = (src[msb] << 8) | (src[lsb]);
3807       src += 2;
3808
3809       if (c1 == '\n' || c1 == '\r')
3810         {
3811           if (*skip == 0)
3812             *skip = src - 2 - source;
3813           total++;
3814           if (c1 == '\n')
3815             {
3816               this_eol_type = CODING_EOL_LF;
3817             }
3818           else
3819             {
3820               if ((src + 1) >= src_end)
3821                 {
3822                   this_eol_type = CODING_EOL_CR;
3823                 }
3824               else
3825                 {
3826                   c2 = (src[msb] << 8) | (src[lsb]);
3827                   if (c2 == '\n')
3828                     this_eol_type = CODING_EOL_CRLF, src += 2;
3829                   else
3830                     this_eol_type = CODING_EOL_CR;
3831                 }
3832             }
3833
3834           if (eol_type == CODING_EOL_UNDECIDED)
3835             /* This is the first end-of-line.  */
3836             eol_type = this_eol_type;
3837           else if (eol_type != this_eol_type)
3838             {
3839               /* The found type is different from what found before.  */
3840               eol_type = CODING_EOL_INCONSISTENT;
3841               break;
3842             }
3843         }
3844     }
3845
3846   if (*skip == 0)
3847     *skip = src_end - source;
3848   return eol_type;
3849 }
3850
3851 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3852    is encoded.  If it detects an appropriate format of end-of-line, it
3853    sets the information in *CODING.  */
3854
3855 void
3856 detect_eol (coding, src, src_bytes)
3857      struct coding_system *coding;
3858      unsigned char *src;
3859      int src_bytes;
3860 {
3861   Lisp_Object val;
3862   int skip;
3863   int eol_type;
3864
3865   switch (coding->category_idx)
3866     {
3867     case CODING_CATEGORY_IDX_UTF_16_BE:
3868       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3869       break;
3870     case CODING_CATEGORY_IDX_UTF_16_LE:
3871       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3872       break;
3873     default:
3874       eol_type = detect_eol_type (src, src_bytes, &skip);
3875       break;
3876     }
3877
3878   if (coding->heading_ascii > skip)
3879     coding->heading_ascii = skip;
3880   else
3881     skip = coding->heading_ascii;
3882
3883   if (eol_type == CODING_EOL_UNDECIDED)
3884     return;
3885   if (eol_type == CODING_EOL_INCONSISTENT)
3886     {
3887 #if 0
3888       /* This code is suppressed until we find a better way to
3889          distinguish raw text file and binary file.  */
3890
3891       /* If we have already detected that the coding is raw-text, the
3892          coding should actually be no-conversion.  */
3893       if (coding->type == coding_type_raw_text)
3894         {
3895           setup_coding_system (Qno_conversion, coding);
3896           return;
3897         }
3898       /* Else, let's decode only text code anyway.  */
3899 #endif /* 0 */
3900       eol_type = CODING_EOL_LF;
3901     }
3902
3903   val = Fget (coding->symbol, Qeol_type);
3904   if (VECTORP (val) && XVECTOR (val)->size == 3)
3905     {
3906       int src_multibyte = coding->src_multibyte;
3907       int dst_multibyte = coding->dst_multibyte;
3908
3909       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3910       coding->src_multibyte = src_multibyte;
3911       coding->dst_multibyte = dst_multibyte;
3912       coding->heading_ascii = skip;
3913     }
3914 }
3915
3916 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3917
3918 #define DECODING_BUFFER_MAG(coding)                     \
3919   (coding->type == coding_type_iso2022                  \
3920    ? 3                                                  \
3921    : (coding->type == coding_type_ccl                   \
3922       ? coding->spec.ccl.decoder.buf_magnification      \
3923       : 2))
3924
3925 /* Return maximum size (bytes) of a buffer enough for decoding
3926    SRC_BYTES of text encoded in CODING.  */
3927
3928 int
3929 decoding_buffer_size (coding, src_bytes)
3930      struct coding_system *coding;
3931      int src_bytes;
3932 {
3933   return (src_bytes * DECODING_BUFFER_MAG (coding)
3934           + CONVERSION_BUFFER_EXTRA_ROOM);
3935 }
3936
3937 /* Return maximum size (bytes) of a buffer enough for encoding
3938    SRC_BYTES of text to CODING.  */
3939
3940 int
3941 encoding_buffer_size (coding, src_bytes)
3942      struct coding_system *coding;
3943      int src_bytes;
3944 {
3945   int magnification;
3946
3947   if (coding->type == coding_type_ccl)
3948     magnification = coding->spec.ccl.encoder.buf_magnification;
3949   else if (CODING_REQUIRE_ENCODING (coding))
3950     magnification = 3;
3951   else
3952     magnification = 1;
3953
3954   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3955 }
3956
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* allocated size of DATA in bytes.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;
};

/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  The expansion is parenthesized so that it
   behaves as a single operand in any expression.  */
#define MAX_ALLOCA (16*1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   BUF must be an lvalue.  LEN is evaluated exactly once.  Memory
   smaller than MAX_ALLOCA comes from alloca, so it lives only until
   the function that uses this macro returns; otherwise it comes from
   xmalloc and must be released with free_conversion_buffer.  */
#define allocate_conversion_buffer(buf, len)                    \
  do {                                                          \
    (buf).size = (len);                                         \
    if ((buf).size < MAX_ALLOCA)                                \
      {                                                         \
        (buf).data = (unsigned char *) alloca ((buf).size);     \
        (buf).on_stack = 1;                                     \
      }                                                         \
    else                                                        \
      {                                                         \
        (buf).data = (unsigned char *) xmalloc ((buf).size);    \
        (buf).on_stack = 0;                                     \
      }                                                         \
  } while (0)
3984
3985 /* Double the allocated memory for *BUF.  */
3986 static void
3987 extend_conversion_buffer (buf)
3988      struct conversion_buffer *buf;
3989 {
3990   if (buf->on_stack)
3991     {
3992       unsigned char *save = buf->data;
3993       buf->data = (unsigned char *) xmalloc (buf->size * 2);
3994       bcopy (save, buf->data, buf->size);
3995       buf->on_stack = 0;
3996     }
3997   else
3998     {
3999       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4000     }
4001   buf->size *= 2;
4002 }
4003
4004 /* Free the allocated memory for BUF if it is not on stack.  */
4005 static void
4006 free_conversion_buffer (buf)
4007      struct conversion_buffer *buf;
4008 {
4009   if (!buf->on_stack)
4010     xfree (buf->data);
4011 }
4012
4013 int
4014 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4015      struct coding_system *coding;
4016      unsigned char *source, *destination;
4017      int src_bytes, dst_bytes, encodep;
4018 {
4019   struct ccl_program *ccl
4020     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4021   int result;
4022
4023   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4024   if (encodep)
4025     ccl->eol_type = coding->eol_type;
4026   ccl->multibyte = coding->src_multibyte;
4027   coding->produced = ccl_driver (ccl, source, destination,
4028                                  src_bytes, dst_bytes, &(coding->consumed));
4029   if (encodep)
4030     coding->produced_char = coding->produced;
4031   else
4032     {
4033       int bytes
4034         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4035       coding->produced = str_as_multibyte (destination, bytes,
4036                                            coding->produced,
4037                                            &(coding->produced_char));
4038     }
4039
4040   switch (ccl->status)
4041     {
4042     case CCL_STAT_SUSPEND_BY_SRC:
4043       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4044       break;
4045     case CCL_STAT_SUSPEND_BY_DST:
4046       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4047       break;
4048     case CCL_STAT_QUIT:
4049     case CCL_STAT_INVALID_CMD:
4050       coding->result = CODING_FINISH_INTERRUPT;
4051       break;
4052     default:
4053       coding->result = CODING_FINISH_NORMAL;
4054       break;
4055     }
4056   return coding->result;
4057 }
4058
4059 /* Decode EOL format of the text at PTR of BYTES length destructively
4060    according to CODING->eol_type.  This is called after the CCL
4061    program produced a decoded text at PTR.  If we do CRLF->LF
4062    conversion, update CODING->produced and CODING->produced_char.  */
4063
4064 static void
4065 decode_eol_post_ccl (coding, ptr, bytes)
4066      struct coding_system *coding;
4067      unsigned char *ptr;
4068      int bytes;
4069 {
4070   Lisp_Object val, saved_coding_symbol;
4071   unsigned char *pend = ptr + bytes;
4072   int dummy;
4073
4074   /* Remember the current coding system symbol.  We set it back when
4075      an inconsistent EOL is found so that `last-coding-system-used' is
4076      set to the coding system that doesn't specify EOL conversion.  */
4077   saved_coding_symbol = coding->symbol;
4078
4079   coding->spec.ccl.cr_carryover = 0;
4080   if (coding->eol_type == CODING_EOL_UNDECIDED)
4081     {
4082       /* Here, to avoid the call of setup_coding_system, we directly
4083          call detect_eol_type.  */
4084       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4085       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4086         coding->eol_type = CODING_EOL_LF;
4087       if (coding->eol_type != CODING_EOL_UNDECIDED)
4088         {
4089           val = Fget (coding->symbol, Qeol_type);
4090           if (VECTORP (val) && XVECTOR (val)->size == 3)
4091             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4092         }
4093       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4094     }
4095
4096   if (coding->eol_type == CODING_EOL_LF
4097       || coding->eol_type == CODING_EOL_UNDECIDED)
4098     {
4099       /* We have nothing to do.  */
4100       ptr = pend;
4101     }
4102   else if (coding->eol_type == CODING_EOL_CRLF)
4103     {
4104       unsigned char *pstart = ptr, *p = ptr;
4105
4106       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4107           && *(pend - 1) == '\r')
4108         {
4109           /* If the last character is CR, we can't handle it here
4110              because LF will be in the not-yet-decoded source text.
4111              Recorded that the CR is not yet processed.  */
4112           coding->spec.ccl.cr_carryover = 1;
4113           coding->produced--;
4114           coding->produced_char--;
4115           pend--;
4116         }
4117       while (ptr < pend)
4118         {
4119           if (*ptr == '\r')
4120             {
4121               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4122                 {
4123                   *p++ = '\n';
4124                   ptr += 2;
4125                 }
4126               else
4127                 {
4128                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4129                     goto undo_eol_conversion;
4130                   *p++ = *ptr++;
4131                 }
4132             }
4133           else if (*ptr == '\n'
4134                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4135             goto undo_eol_conversion;
4136           else
4137             *p++ = *ptr++;
4138           continue;
4139
4140         undo_eol_conversion:
4141           /* We have faced with inconsistent EOL format at PTR.
4142              Convert all LFs before PTR back to CRLFs.  */
4143           for (p--, ptr--; p >= pstart; p--)
4144             {
4145               if (*p == '\n')
4146                 *ptr-- = '\n', *ptr-- = '\r';
4147               else
4148                 *ptr-- = *p;
4149             }
4150           /*  If carryover is recorded, cancel it because we don't
4151               convert CRLF anymore.  */
4152           if (coding->spec.ccl.cr_carryover)
4153             {
4154               coding->spec.ccl.cr_carryover = 0;
4155               coding->produced++;
4156               coding->produced_char++;
4157               pend++;
4158             }
4159           p = ptr = pend;
4160           coding->eol_type = CODING_EOL_LF;
4161           coding->symbol = saved_coding_symbol;
4162         }
4163       if (p < pend)
4164         {
4165           /* As each two-byte sequence CRLF was converted to LF, (PEND
4166              - P) is the number of deleted characters.  */
4167           coding->produced -= pend - p;
4168           coding->produced_char -= pend - p;
4169         }
4170     }
4171   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4172     {
4173       unsigned char *p = ptr;
4174
4175       for (; ptr < pend; ptr++)
4176         {
4177           if (*ptr == '\r')
4178             *ptr = '\n';
4179           else if (*ptr == '\n'
4180                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4181             {
4182               for (; p < ptr; p++)
4183                 {
4184                   if (*p == '\n')
4185                     *p = '\r';
4186                 }
4187               ptr = pend;
4188               coding->eol_type = CODING_EOL_LF;
4189               coding->symbol = saved_coding_symbol;
4190             }
4191         }
4192     }
4193 }
4194
4195 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4196    decoding, it may detect coding system and format of end-of-line if
4197    those are not yet decided.  The source should be unibyte, the
4198    result is multibyte if CODING->dst_multibyte is nonzero, else
4199    unibyte.  */
4200
4201 int
4202 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4203      struct coding_system *coding;
4204      unsigned char *source, *destination;
4205      int src_bytes, dst_bytes;
4206 {
4207   if (coding->type == coding_type_undecided)
4208     detect_coding (coding, source, src_bytes);
4209
4210   if (coding->eol_type == CODING_EOL_UNDECIDED
4211       && coding->type != coding_type_ccl)
4212     detect_eol (coding, source, src_bytes);
4213
4214   coding->produced = coding->produced_char = 0;
4215   coding->consumed = coding->consumed_char = 0;
4216   coding->errors = 0;
4217   coding->result = CODING_FINISH_NORMAL;
4218
4219   switch (coding->type)
4220     {
4221     case coding_type_sjis:
4222       decode_coding_sjis_big5 (coding, source, destination,
4223                                src_bytes, dst_bytes, 1);
4224       break;
4225
4226     case coding_type_iso2022:
4227       decode_coding_iso2022 (coding, source, destination,
4228                              src_bytes, dst_bytes);
4229       break;
4230
4231     case coding_type_big5:
4232       decode_coding_sjis_big5 (coding, source, destination,
4233                                src_bytes, dst_bytes, 0);
4234       break;
4235
4236     case coding_type_emacs_mule:
4237       decode_coding_emacs_mule (coding, source, destination,
4238                                 src_bytes, dst_bytes);
4239       break;
4240
4241     case coding_type_ccl:
4242       if (coding->spec.ccl.cr_carryover)
4243         {
4244           /* Set the CR which is not processed by the previous call of
4245              decode_eol_post_ccl in DESTINATION.  */
4246           *destination = '\r';
4247           coding->produced++;
4248           coding->produced_char++;
4249           dst_bytes--;
4250         }
4251       ccl_coding_driver (coding, source,
4252                          destination + coding->spec.ccl.cr_carryover,
4253                          src_bytes, dst_bytes, 0);
4254       if (coding->eol_type != CODING_EOL_LF)
4255         decode_eol_post_ccl (coding, destination, coding->produced);
4256       break;
4257
4258     default:
4259       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4260     }
4261
4262   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4263       && coding->mode & CODING_MODE_LAST_BLOCK
4264       && coding->consumed == src_bytes)
4265     coding->result = CODING_FINISH_NORMAL;
4266
4267   if (coding->mode & CODING_MODE_LAST_BLOCK
4268       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4269     {
4270       unsigned char *src = source + coding->consumed;
4271       unsigned char *dst = destination + coding->produced;
4272
4273       src_bytes -= coding->consumed;
4274       coding->errors++;
4275       if (COMPOSING_P (coding))
4276         DECODE_COMPOSITION_END ('1');
4277       while (src_bytes--)
4278         {
4279           int c = *src++;
4280           dst += CHAR_STRING (c, dst);
4281           coding->produced_char++;
4282         }
4283       coding->consumed = coding->consumed_char = src - source;
4284       coding->produced = dst - destination;
4285       coding->result = CODING_FINISH_NORMAL;
4286     }
4287
4288   if (!coding->dst_multibyte)
4289     {
4290       coding->produced = str_as_unibyte (destination, coding->produced);
4291       coding->produced_char = coding->produced;
4292     }
4293
4294   return coding->result;
4295 }
4296
4297 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4298    multibyteness of the source is CODING->src_multibyte, the
4299    multibyteness of the result is always unibyte.  */
4300
4301 int
4302 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4303      struct coding_system *coding;
4304      unsigned char *source, *destination;
4305      int src_bytes, dst_bytes;
4306 {
4307   coding->produced = coding->produced_char = 0;
4308   coding->consumed = coding->consumed_char = 0;
4309   coding->errors = 0;
4310   coding->result = CODING_FINISH_NORMAL;
4311
4312   switch (coding->type)
4313     {
4314     case coding_type_sjis:
4315       encode_coding_sjis_big5 (coding, source, destination,
4316                                src_bytes, dst_bytes, 1);
4317       break;
4318
4319     case coding_type_iso2022:
4320       encode_coding_iso2022 (coding, source, destination,
4321                              src_bytes, dst_bytes);
4322       break;
4323
4324     case coding_type_big5:
4325       encode_coding_sjis_big5 (coding, source, destination,
4326                                src_bytes, dst_bytes, 0);
4327       break;
4328
4329     case coding_type_emacs_mule:
4330       encode_coding_emacs_mule (coding, source, destination,
4331                                 src_bytes, dst_bytes);
4332       break;
4333
4334     case coding_type_ccl:
4335       ccl_coding_driver (coding, source, destination,
4336                          src_bytes, dst_bytes, 1);
4337       break;
4338
4339     default:
4340       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4341     }
4342
4343   if (coding->mode & CODING_MODE_LAST_BLOCK
4344       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4345     {
4346       unsigned char *src = source + coding->consumed;
4347       unsigned char *src_end = src + src_bytes;
4348       unsigned char *dst = destination + coding->produced;
4349
4350       if (coding->type == coding_type_iso2022)
4351         ENCODE_RESET_PLANE_AND_REGISTER;
4352       if (COMPOSING_P (coding))
4353         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4354       if (coding->consumed < src_bytes)
4355         {
4356           int len = src_bytes - coding->consumed;
4357
4358           BCOPY_SHORT (source + coding->consumed, dst, len);
4359           if (coding->src_multibyte)
4360             len = str_as_unibyte (dst, len);
4361           dst += len;
4362           coding->consumed = src_bytes;
4363         }
4364       coding->produced = coding->produced_char = dst - destination;
4365       coding->result = CODING_FINISH_NORMAL;
4366     }
4367
4368   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4369       && coding->consumed == src_bytes)
4370     coding->result = CODING_FINISH_NORMAL;
4371
4372   return coding->result;
4373 }
4374
4375 /* Scan text in the region between *BEG and *END (byte positions),
4376    skip characters which we don't have to decode by coding system
4377    CODING at the head and tail, then set *BEG and *END to the region
4378    of the text we actually have to convert.  The caller should move
4379    the gap out of the region in advance if the region is from a
4380    buffer.
4381
4382    If STR is not NULL, *BEG and *END are indices into STR.  */
4383
4384 static void
4385 shrink_decoding_region (beg, end, coding, str)
4386      int *beg, *end;
4387      struct coding_system *coding;
4388      unsigned char *str;
4389 {
4390   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4391   int eol_conversion;
4392   Lisp_Object translation_table;
4393
4394   if (coding->type == coding_type_ccl
4395       || coding->type == coding_type_undecided
4396       || coding->eol_type != CODING_EOL_LF
4397       || !NILP (coding->post_read_conversion)
4398       || coding->composing != COMPOSITION_DISABLED)
4399     {
4400       /* We can't skip any data.  */
4401       return;
4402     }
4403   if (coding->type == coding_type_no_conversion
4404       || coding->type == coding_type_raw_text
4405       || coding->type == coding_type_emacs_mule)
4406     {
4407       /* We need no conversion, but don't have to skip any data here.
4408          Decoding routine handles them effectively anyway.  */
4409       return;
4410     }
4411
4412   translation_table = coding->translation_table_for_decode;
4413   if (NILP (translation_table) && !NILP (Venable_character_translation))
4414     translation_table = Vstandard_translation_table_for_decode;
4415   if (CHAR_TABLE_P (translation_table))
4416     {
4417       int i;
4418       for (i = 0; i < 128; i++)
4419         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4420           break;
4421       if (i < 128)
4422         /* Some ASCII character should be translated.  We give up
4423            shrinking.  */
4424         return;
4425     }
4426
4427   if (coding->heading_ascii >= 0)
4428     /* Detection routine has already found how much we can skip at the
4429        head.  */
4430     *beg += coding->heading_ascii;
4431
4432   if (str)
4433     {
4434       begp_orig = begp = str + *beg;
4435       endp_orig = endp = str + *end;
4436     }
4437   else
4438     {
4439       begp_orig = begp = BYTE_POS_ADDR (*beg);
4440       endp_orig = endp = begp + *end - *beg;
4441     }
4442
4443   eol_conversion = (coding->eol_type == CODING_EOL_CR
4444                     || coding->eol_type == CODING_EOL_CRLF);
4445
4446   switch (coding->type)
4447     {
4448     case coding_type_sjis:
4449     case coding_type_big5:
4450       /* We can skip all ASCII characters at the head.  */
4451       if (coding->heading_ascii < 0)
4452         {
4453           if (eol_conversion)
4454             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4455           else
4456             while (begp < endp && *begp < 0x80) begp++;
4457         }
4458       /* We can skip all ASCII characters at the tail except for the
4459          second byte of SJIS or BIG5 code.  */
4460       if (eol_conversion)
4461         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4462       else
4463         while (begp < endp && endp[-1] < 0x80) endp--;
4464       /* Do not consider LF as ascii if preceded by CR, since that
4465          confuses eol decoding. */
4466       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4467         endp++;
4468       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4469         endp++;
4470       break;
4471
4472     case coding_type_iso2022:
4473       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4474         /* We can't skip any data.  */
4475         break;
4476       if (coding->heading_ascii < 0)
4477         {
4478           /* We can skip all ASCII characters at the head except for a
4479              few control codes.  */
4480           while (begp < endp && (c = *begp) < 0x80
4481                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4482                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4483                  && (!eol_conversion || c != ISO_CODE_LF))
4484             begp++;
4485         }
4486       switch (coding->category_idx)
4487         {
4488         case CODING_CATEGORY_IDX_ISO_8_1:
4489         case CODING_CATEGORY_IDX_ISO_8_2:
4490           /* We can skip all ASCII characters at the tail.  */
4491           if (eol_conversion)
4492             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4493           else
4494             while (begp < endp && endp[-1] < 0x80) endp--;
4495           /* Do not consider LF as ascii if preceded by CR, since that
4496              confuses eol decoding. */
4497           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4498             endp++;
4499           break;
4500
4501         case CODING_CATEGORY_IDX_ISO_7:
4502         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4503           {
4504             /* We can skip all charactes at the tail except for 8-bit
4505                codes and ESC and the following 2-byte at the tail.  */
4506             unsigned char *eight_bit = NULL;
4507
4508             if (eol_conversion)
4509               while (begp < endp
4510                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4511                 {
4512                   if (!eight_bit && c & 0x80) eight_bit = endp;
4513                   endp--;
4514                 }
4515             else
4516               while (begp < endp
4517                      && (c = endp[-1]) != ISO_CODE_ESC)
4518                 {
4519                   if (!eight_bit && c & 0x80) eight_bit = endp;
4520                   endp--;
4521                 }
4522             /* Do not consider LF as ascii if preceded by CR, since that
4523                confuses eol decoding. */
4524             if (begp < endp && endp < endp_orig
4525                 && endp[-1] == '\r' && endp[0] == '\n')
4526               endp++;
4527             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4528               {
4529                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4530                   /* This is an ASCII designation sequence.  We can
4531                      surely skip the tail.  But, if we have
4532                      encountered an 8-bit code, skip only the codes
4533                      after that.  */
4534                   endp = eight_bit ? eight_bit : endp + 2;
4535                 else
4536                   /* Hmmm, we can't skip the tail.  */
4537                   endp = endp_orig;
4538               }
4539             else if (eight_bit)
4540               endp = eight_bit;
4541           }
4542         }
4543       break;
4544
4545     default:
4546       abort ();
4547     }
4548   *beg += begp - begp_orig;
4549   *end += endp - endp_orig;
4550   return;
4551 }
4552
4553 /* Like shrink_decoding_region but for encoding.  */
4554
4555 static void
4556 shrink_encoding_region (beg, end, coding, str)
4557      int *beg, *end;
4558      struct coding_system *coding;
4559      unsigned char *str;
4560 {
4561   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4562   int eol_conversion;
4563   Lisp_Object translation_table;
4564
4565   if (coding->type == coding_type_ccl
4566       || coding->eol_type == CODING_EOL_CRLF
4567       || coding->eol_type == CODING_EOL_CR
4568       || coding->cmp_data && coding->cmp_data->used > 0)
4569     {
4570       /* We can't skip any data.  */
4571       return;
4572     }
4573   if (coding->type == coding_type_no_conversion
4574       || coding->type == coding_type_raw_text
4575       || coding->type == coding_type_emacs_mule
4576       || coding->type == coding_type_undecided)
4577     {
4578       /* We need no conversion, but don't have to skip any data here.
4579          Encoding routine handles them effectively anyway.  */
4580       return;
4581     }
4582
4583   translation_table = coding->translation_table_for_encode;
4584   if (NILP (translation_table) && !NILP (Venable_character_translation))
4585     translation_table = Vstandard_translation_table_for_encode;
4586   if (CHAR_TABLE_P (translation_table))
4587     {
4588       int i;
4589       for (i = 0; i < 128; i++)
4590         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4591           break;
4592       if (i < 128)
4593         /* Some ASCII character should be tranlsated.  We give up
4594            shrinking.  */
4595         return;
4596     }
4597
4598   if (str)
4599     {
4600       begp_orig = begp = str + *beg;
4601       endp_orig = endp = str + *end;
4602     }
4603   else
4604     {
4605       begp_orig = begp = BYTE_POS_ADDR (*beg);
4606       endp_orig = endp = begp + *end - *beg;
4607     }
4608
4609   eol_conversion = (coding->eol_type == CODING_EOL_CR
4610                     || coding->eol_type == CODING_EOL_CRLF);
4611
4612   /* Here, we don't have to check coding->pre_write_conversion because
4613      the caller is expected to have handled it already.  */
4614   switch (coding->type)
4615     {
4616     case coding_type_iso2022:
4617       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4618         /* We can't skip any data.  */
4619         break;
4620       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4621         {
4622           unsigned char *bol = begp;
4623           while (begp < endp && *begp < 0x80)
4624             {
4625               begp++;
4626               if (begp[-1] == '\n')
4627                 bol = begp;
4628             }
4629           begp = bol;
4630           goto label_skip_tail;
4631         }
4632       /* fall down ... */
4633
4634     case coding_type_sjis:
4635     case coding_type_big5:
4636       /* We can skip all ASCII characters at the head and tail.  */
4637       if (eol_conversion)
4638         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4639       else
4640         while (begp < endp && *begp < 0x80) begp++;
4641     label_skip_tail:
4642       if (eol_conversion)
4643         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4644       else
4645         while (begp < endp && *(endp - 1) < 0x80) endp--;
4646       break;
4647
4648     default:
4649       abort ();
4650     }
4651
4652   *beg += begp - begp_orig;
4653   *end += endp - endp_orig;
4654   return;
4655 }
4656
/* Shrinking the conversion region costs something by itself, so it
   is attempted only when the region is longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END for CODING when it is longer than
   the threshold above: with shrink_encoding_region if ENCODEP is
   nonzero, with shrink_decoding_region otherwise.  BEG and END are
   evaluated more than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
4670
/* Unwind handler used around calls of pre-write-conversion and
   post-read-conversion functions: reset inhibit_pre_post_conversion
   to 0 and return Qnil.  DUMMY is ignored.  The callers in this file
   register it with record_unwind_protect before they set the flag
   to 1.  */

static Lisp_Object
code_convert_region_unwind (dummy)
     Lisp_Object dummy;
{
  inhibit_pre_post_conversion = 0;
  return Qnil;
}
4678
4679 /* Store information about all compositions in the range FROM and TO
4680    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4681    buffer or a string, defaults to the current buffer.  */
4682
4683 void
4684 coding_save_composition (coding, from, to, obj)
4685      struct coding_system *coding;
4686      int from, to;
4687      Lisp_Object obj;
4688 {
4689   Lisp_Object prop;
4690   int start, end;
4691
4692   if (coding->composing == COMPOSITION_DISABLED)
4693     return;
4694   if (!coding->cmp_data)
4695     coding_allocate_composition_data (coding, from);
4696   if (!find_composition (from, to, &start, &end, &prop, obj)
4697       || end > to)
4698     return;
4699   if (start < from
4700       && (!find_composition (end, to, &start, &end, &prop, obj)
4701           || end > to))
4702     return;
4703   coding->composing = COMPOSITION_NO;
4704   do
4705     {
4706       if (COMPOSITION_VALID_P (start, end, prop))
4707         {
4708           enum composition_method method = COMPOSITION_METHOD (prop);
4709           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4710               >= COMPOSITION_DATA_SIZE)
4711             coding_allocate_composition_data (coding, from);
4712           /* For relative composition, we remember start and end
4713              positions, for the other compositions, we also remember
4714              components.  */
4715           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4716           if (method != COMPOSITION_RELATIVE)
4717             {
4718               /* We must store a*/
4719               Lisp_Object val, ch;
4720
4721               val = COMPOSITION_COMPONENTS (prop);
4722               if (CONSP (val))
4723                 while (CONSP (val))
4724                   {
4725                     ch = XCAR (val), val = XCDR (val);
4726                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4727                   }
4728               else if (VECTORP (val) || STRINGP (val))
4729                 {
4730                   int len = (VECTORP (val)
4731                              ? XVECTOR (val)->size : XSTRING (val)->size);
4732                   int i;
4733                   for (i = 0; i < len; i++)
4734                     {
4735                       ch = (STRINGP (val)
4736                             ? Faref (val, make_number (i))
4737                             : XVECTOR (val)->contents[i]);
4738                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4739                     }
4740                 }
4741               else              /* INTEGERP (val) */
4742                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4743             }
4744           CODING_ADD_COMPOSITION_END (coding, end - from);
4745         }
4746       start = end;
4747     }
4748   while (start < to
4749          && find_composition (start, to, &start, &end, &prop, obj)
4750          && end <= to);
4751
4752   /* Make coding->cmp_data point to the first memory block.  */
4753   while (coding->cmp_data->prev)
4754     coding->cmp_data = coding->cmp_data->prev;
4755   coding->cmp_data_start = 0;
4756 }
4757
4758 /* Reflect the saved information about compositions to OBJ.
4759    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4760    is a buffer or a string, defaults to the current buffer.  */
4761
4762 void
4763 coding_restore_composition (coding, obj)
4764      struct coding_system *coding;
4765      Lisp_Object obj;
4766 {
4767   struct composition_data *cmp_data = coding->cmp_data;
4768
4769   if (!cmp_data)
4770     return;
4771
4772   while (cmp_data->prev)
4773     cmp_data = cmp_data->prev;
4774
4775   while (cmp_data)
4776     {
4777       int i;
4778
4779       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
4780            i += cmp_data->data[i])
4781         {
4782           int *data = cmp_data->data + i;
4783           enum composition_method method = (enum composition_method) data[3];
4784           Lisp_Object components;
4785
4786           if (method == COMPOSITION_RELATIVE)
4787             components = Qnil;
4788           else
4789             {
4790               int len = data[0] - 4, j;
4791               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4792
4793               for (j = 0; j < len; j++)
4794                 args[j] = make_number (data[4 + j]);
4795               components = (method == COMPOSITION_WITH_ALTCHARS
4796                             ? Fstring (len, args) : Fvector (len, args));
4797             }
4798           compose_text (data[1], data[2], components, Qnil, obj);
4799         }
4800       cmp_data = cmp_data->next;
4801     }
4802 }
4803
4804 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4805    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4806    coding system CODING, and return the status code of code conversion
4807    (currently, this value has no meaning).
4808
4809    How many characters (and bytes) are converted to how many
4810    characters (and bytes) are recorded in members of the structure
4811    CODING.
4812
4813    If REPLACE is nonzero, we do various things as if the original text
4814    is deleted and a new text is inserted.  See the comments in
4815    replace_range (insdel.c) to know what we are doing.
4816
4817    If REPLACE is zero, it is assumed that the source text is unibyte.
4818    Otherwize, it is assumed that the source text is multibyte.  */
4819
4820 int
4821 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4822      int from, from_byte, to, to_byte, encodep, replace;
4823      struct coding_system *coding;
4824 {
4825   int len = to - from, len_byte = to_byte - from_byte;
4826   int require, inserted, inserted_byte;
4827   int head_skip, tail_skip, total_skip = 0;
4828   Lisp_Object saved_coding_symbol;
4829   int first = 1;
4830   unsigned char *src, *dst;
4831   Lisp_Object deletion;
4832   int orig_point = PT, orig_len = len;
4833   int prev_Z;
4834   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4835
4836   deletion = Qnil;
4837   saved_coding_symbol = Qnil;
4838
4839   if (from < PT && PT < to)
4840     {
4841       TEMP_SET_PT_BOTH (from, from_byte);
4842       orig_point = from;
4843     }
4844
4845   if (replace)
4846     {
4847       int saved_from = from;
4848       int saved_inhibit_modification_hooks;
4849
4850       prepare_to_modify_buffer (from, to, &from);
4851       if (saved_from != from)
4852         {
4853           to = from + len;
4854           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4855           len_byte = to_byte - from_byte;
4856         }
4857
4858       /* The code conversion routine can not preserve text properties
4859          for now.  So, we must remove all text properties in the
4860          region.  Here, we must suppress all modification hooks.  */
4861       saved_inhibit_modification_hooks = inhibit_modification_hooks;
4862       inhibit_modification_hooks = 1;
4863       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4864       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4865     }
4866
4867   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4868     {
4869       /* We must detect encoding of text and eol format.  */
4870
4871       if (from < GPT && to > GPT)
4872         move_gap_both (from, from_byte);
4873       if (coding->type == coding_type_undecided)
4874         {
4875           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4876           if (coding->type == coding_type_undecided)
4877             {
4878               /* It seems that the text contains only ASCII, but we
4879                  should not leave it undecided because the deeper
4880                  decoding routine (decode_coding) tries to detect the
4881                  encodings again in vain.  */
4882               coding->type = coding_type_emacs_mule;
4883               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
4884             }
4885         }
4886       if (coding->eol_type == CODING_EOL_UNDECIDED
4887           && coding->type != coding_type_ccl)
4888         {
4889           saved_coding_symbol = coding->symbol;
4890           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4891           if (coding->eol_type == CODING_EOL_UNDECIDED)
4892             coding->eol_type = CODING_EOL_LF;
4893           /* We had better recover the original eol format if we
4894              encounter an inconsitent eol format while decoding.  */
4895           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4896         }
4897     }
4898
4899   /* Now we convert the text.  */
4900
4901   /* For encoding, we must process pre-write-conversion in advance.  */
4902   if (! inhibit_pre_post_conversion
4903       && encodep
4904       && SYMBOLP (coding->pre_write_conversion)
4905       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4906     {
4907       /* The function in pre-write-conversion may put a new text in a
4908          new buffer.  */
4909       struct buffer *prev = current_buffer;
4910       Lisp_Object new;
4911       int count = specpdl_ptr - specpdl;
4912
4913       record_unwind_protect (code_convert_region_unwind, Qnil);
4914       /* We should not call any more pre-write/post-read-conversion
4915          functions while this pre-write-conversion is running.  */
4916       inhibit_pre_post_conversion = 1;
4917       call2 (coding->pre_write_conversion,
4918              make_number (from), make_number (to));
4919       inhibit_pre_post_conversion = 0;
4920       /* Discard the unwind protect.  */
4921       specpdl_ptr--;
4922
4923       if (current_buffer != prev)
4924         {
4925           len = ZV - BEGV;
4926           new = Fcurrent_buffer ();
4927           set_buffer_internal_1 (prev);
4928           del_range_2 (from, from_byte, to, to_byte, 0);
4929           TEMP_SET_PT_BOTH (from, from_byte);
4930           insert_from_buffer (XBUFFER (new), 1, len, 0);
4931           Fkill_buffer (new);
4932           if (orig_point >= to)
4933             orig_point += len - orig_len;
4934           else if (orig_point > from)
4935             orig_point = from;
4936           orig_len = len;
4937           to = from + len;
4938           from_byte = CHAR_TO_BYTE (from);
4939           to_byte = CHAR_TO_BYTE (to);
4940           len_byte = to_byte - from_byte;
4941           TEMP_SET_PT_BOTH (from, from_byte);
4942         }
4943     }
4944
4945   if (replace)
4946     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4947
4948   if (coding->composing != COMPOSITION_DISABLED)
4949     {
4950       if (encodep)
4951         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4952       else
4953         coding_allocate_composition_data (coding, from);
4954     }
4955
4956   /* Try to skip the heading and tailing ASCIIs.  */
4957   if (coding->type != coding_type_ccl)
4958     {
4959       int from_byte_orig = from_byte, to_byte_orig = to_byte;
4960
4961       if (from < GPT && GPT < to)
4962         move_gap_both (from, from_byte);
4963       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4964       if (from_byte == to_byte
4965           && (encodep || NILP (coding->post_read_conversion))
4966           && ! CODING_REQUIRE_FLUSHING (coding))
4967         {
4968           coding->produced = len_byte;
4969           coding->produced_char = len;
4970           if (!replace)
4971             /* We must record and adjust for this new text now.  */
4972             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4973           return 0;
4974         }
4975
4976       head_skip = from_byte - from_byte_orig;
4977       tail_skip = to_byte_orig - to_byte;
4978       total_skip = head_skip + tail_skip;
4979       from += head_skip;
4980       to -= tail_skip;
4981       len -= total_skip; len_byte -= total_skip;
4982     }
4983
4984   /* For converion, we must put the gap before the text in addition to
4985      making the gap larger for efficient decoding.  The required gap
4986      size starts from 2000 which is the magic number used in make_gap.
4987      But, after one batch of conversion, it will be incremented if we
4988      find that it is not enough .  */
4989   require = 2000;
4990
4991   if (GAP_SIZE  < require)
4992     make_gap (require - GAP_SIZE);
4993   move_gap_both (from, from_byte);
4994
4995   inserted = inserted_byte = 0;
4996
4997   GAP_SIZE += len_byte;
4998   ZV -= len;
4999   Z -= len;
5000   ZV_BYTE -= len_byte;
5001   Z_BYTE -= len_byte;
5002
5003   if (GPT - BEG < BEG_UNCHANGED)
5004     BEG_UNCHANGED = GPT - BEG;
5005   if (Z - GPT < END_UNCHANGED)
5006     END_UNCHANGED = Z - GPT;
5007
5008   if (!encodep && coding->src_multibyte)
5009     {
5010       /* Decoding routines expects that the source text is unibyte.
5011          We must convert 8-bit characters of multibyte form to
5012          unibyte.  */
5013       int len_byte_orig = len_byte;
5014       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5015       if (len_byte < len_byte_orig)
5016         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5017                     len_byte);
5018       coding->src_multibyte = 0;
5019     }
5020
5021   for (;;)
5022     {
5023       int result;
5024
5025       /* The buffer memory is now:
5026          +--------+converted-text+---------+-------original-text-------+---+
5027          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5028                   |<---------------------- GAP ----------------------->|  */
5029       src = GAP_END_ADDR - len_byte;
5030       dst = GPT_ADDR + inserted_byte;
5031
5032       if (encodep)
5033         result = encode_coding (coding, src, dst, len_byte, 0);
5034       else
5035         result = decode_coding (coding, src, dst, len_byte, 0);
5036
5037       /* The buffer memory is now:
5038          +--------+-------converted-text----+--+------original-text----+---+
5039          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5040                   |<---------------------- GAP ----------------------->|  */
5041
5042       inserted += coding->produced_char;
5043       inserted_byte += coding->produced;
5044       len_byte -= coding->consumed;
5045
5046       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5047         {
5048           coding_allocate_composition_data (coding, from + inserted);
5049           continue;
5050         }
5051
5052       src += coding->consumed;
5053       dst += coding->produced;
5054
5055       if (result == CODING_FINISH_NORMAL)
5056         {
5057           src += len_byte;
5058           break;
5059         }
5060       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5061         {
5062           unsigned char *pend = dst, *p = pend - inserted_byte;
5063           Lisp_Object eol_type;
5064
5065           /* Encode LFs back to the original eol format (CR or CRLF).  */
5066           if (coding->eol_type == CODING_EOL_CR)
5067             {
5068               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5069             }
5070           else
5071             {
5072               int count = 0;
5073
5074               while (p < pend) if (*p++ == '\n') count++;
5075               if (src - dst < count)
5076                 {
5077                   /* We don't have sufficient room for encoding LFs
5078                      back to CRLF.  We must record converted and
5079                      not-yet-converted text back to the buffer
5080                      content, enlarge the gap, then record them out of
5081                      the buffer contents again.  */
5082                   int add = len_byte + inserted_byte;
5083
5084                   GAP_SIZE -= add;
5085                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5086                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5087                   make_gap (count - GAP_SIZE);
5088                   GAP_SIZE += add;
5089                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5090                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5091                   /* Don't forget to update SRC, DST, and PEND.  */
5092                   src = GAP_END_ADDR - len_byte;
5093                   dst = GPT_ADDR + inserted_byte;
5094                   pend = dst;
5095                 }
5096               inserted += count;
5097               inserted_byte += count;
5098               coding->produced += count;
5099               p = dst = pend + count;
5100               while (count)
5101                 {
5102                   *--p = *--pend;
5103                   if (*p == '\n') count--, *--p = '\r';
5104                 }
5105             }
5106
5107           /* Suppress eol-format conversion in the further conversion.  */
5108           coding->eol_type = CODING_EOL_LF;
5109
5110           /* Set the coding system symbol to that for Unix-like EOL.  */
5111           eol_type = Fget (saved_coding_symbol, Qeol_type);
5112           if (VECTORP (eol_type)
5113               && XVECTOR (eol_type)->size == 3
5114               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5115             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5116           else
5117             coding->symbol = saved_coding_symbol;
5118
5119           continue;
5120         }
5121       if (len_byte <= 0)
5122         {
5123           if (coding->type != coding_type_ccl
5124               || coding->mode & CODING_MODE_LAST_BLOCK)
5125             break;
5126           coding->mode |= CODING_MODE_LAST_BLOCK;
5127           continue;
5128         }
5129       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5130         {
5131           /* The source text ends in invalid codes.  Let's just
5132              make them valid buffer contents, and finish conversion.  */
5133           inserted += len_byte;
5134           inserted_byte += len_byte;
5135           while (len_byte--)
5136             *dst++ = *src++;
5137           break;
5138         }
5139       if (result == CODING_FINISH_INTERRUPT)
5140         {
5141           /* The conversion procedure was interrupted by a user.  */
5142           break;
5143         }
5144       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5145       if (coding->consumed < 1)
5146         {
5147           /* It's quite strange to require more memory without
5148              consuming any bytes.  Perhaps CCL program bug.  */
5149           break;
5150         }
5151       if (first)
5152         {
5153           /* We have just done the first batch of conversion which was
5154              stoped because of insufficient gap.  Let's reconsider the
5155              required gap size (i.e. SRT - DST) now.
5156
5157              We have converted ORIG bytes (== coding->consumed) into
5158              NEW bytes (coding->produced).  To convert the remaining
5159              LEN bytes, we may need REQUIRE bytes of gap, where:
5160                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5161                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5162              Here, we are sure that NEW >= ORIG.  */
5163           float ratio = coding->produced - coding->consumed;
5164           ratio /= coding->consumed;
5165           require = len_byte * ratio;
5166           first = 0;
5167         }
5168       if ((src - dst) < (require + 2000))
5169         {
5170           /* See the comment above the previous call of make_gap.  */
5171           int add = len_byte + inserted_byte;
5172
5173           GAP_SIZE -= add;
5174           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5175           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5176           make_gap (require + 2000);
5177           GAP_SIZE += add;
5178           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5179           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5180         }
5181     }
5182   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5183
5184   if (encodep && coding->dst_multibyte)
5185     {
5186       /* The output is unibyte.  We must convert 8-bit characters to
5187          multibyte form.  */
5188       if (inserted_byte * 2 > GAP_SIZE)
5189         {
5190           GAP_SIZE -= inserted_byte;
5191           ZV += inserted_byte; Z += inserted_byte;
5192           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5193           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5194           make_gap (inserted_byte - GAP_SIZE);
5195           GAP_SIZE += inserted_byte;
5196           ZV -= inserted_byte; Z -= inserted_byte;
5197           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5198           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5199         }
5200       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5201     }
5202
5203   /* If we have shrinked the conversion area, adjust it now.  */
5204   if (total_skip > 0)
5205     {
5206       if (tail_skip > 0)
5207         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5208       inserted += total_skip; inserted_byte += total_skip;
5209       GAP_SIZE += total_skip;
5210       GPT -= head_skip; GPT_BYTE -= head_skip;
5211       ZV -= total_skip; ZV_BYTE -= total_skip;
5212       Z -= total_skip; Z_BYTE -= total_skip;
5213       from -= head_skip; from_byte -= head_skip;
5214       to += tail_skip; to_byte += tail_skip;
5215     }
5216
5217   prev_Z = Z;
5218   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5219   inserted = Z - prev_Z;
5220
5221   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5222     coding_restore_composition (coding, Fcurrent_buffer ());
5223   coding_free_composition_data (coding);
5224
5225   if (! inhibit_pre_post_conversion
5226       && ! encodep && ! NILP (coding->post_read_conversion))
5227     {
5228       Lisp_Object val;
5229       int count = specpdl_ptr - specpdl;
5230
5231       if (from != PT)
5232         TEMP_SET_PT_BOTH (from, from_byte);
5233       prev_Z = Z;
5234       record_unwind_protect (code_convert_region_unwind, Qnil);
5235       /* We should not call any more pre-write/post-read-conversion
5236          functions while this post-read-conversion is running.  */
5237       inhibit_pre_post_conversion = 1;
5238       val = call1 (coding->post_read_conversion, make_number (inserted));
5239       inhibit_pre_post_conversion = 0;
5240       /* Discard the unwind protect.  */
5241       specpdl_ptr--;
5242       CHECK_NUMBER (val, 0);
5243       inserted += Z - prev_Z;
5244     }
5245
5246   if (orig_point >= from)
5247     {
5248       if (orig_point >= from + orig_len)
5249         orig_point += inserted - orig_len;
5250       else
5251         orig_point = from;
5252       TEMP_SET_PT (orig_point);
5253     }
5254
5255   if (replace)
5256     {
5257       signal_after_change (from, to - from, inserted);
5258       update_compositions (from, from + inserted, CHECK_BORDER);
5259     }
5260
5261   {
5262     coding->consumed = to_byte - from_byte;
5263     coding->consumed_char = to - from;
5264     coding->produced = inserted_byte;
5265     coding->produced_char = inserted;
5266   }
5267
5268   return 0;
5269 }
5270
5271 Lisp_Object
5272 run_pre_post_conversion_on_str (str, coding, encodep)
5273      Lisp_Object str;
5274      struct coding_system *coding;
5275      int encodep;
5276 {
5277   int count = specpdl_ptr - specpdl;
5278   struct gcpro gcpro1;
5279   struct buffer *prev = current_buffer;
5280   int multibyte = STRING_MULTIBYTE (str);
5281
5282   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5283   record_unwind_protect (code_convert_region_unwind, Qnil);
5284   GCPRO1 (str);
5285   temp_output_buffer_setup (" *code-converting-work*");
5286   set_buffer_internal (XBUFFER (Vstandard_output));
5287   /* We must insert the contents of STR as is without
5288      unibyte<->multibyte conversion.  For that, we adjust the
5289      multibyteness of the working buffer to that of STR.  */
5290   Ferase_buffer ();
5291   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5292   insert_from_string (str, 0, 0,
5293                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5294   UNGCPRO;
5295   inhibit_pre_post_conversion = 1;
5296   if (encodep)
5297     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5298   else
5299     {
5300       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5301       call1 (coding->post_read_conversion, make_number (Z - BEG));
5302     }
5303   inhibit_pre_post_conversion = 0;
5304   str = make_buffer_string (BEG, Z, 1);
5305   return unbind_to (count, str);
5306 }
5307
5308 Lisp_Object
5309 decode_coding_string (str, coding, nocopy)
5310      Lisp_Object str;
5311      struct coding_system *coding;
5312      int nocopy;
5313 {
5314   int len;
5315   struct conversion_buffer buf;
5316   int from, to, to_byte;
5317   struct gcpro gcpro1;
5318   Lisp_Object saved_coding_symbol;
5319   int result;
5320   int require_decoding;
5321   int shrinked_bytes = 0;
5322   Lisp_Object newstr;
5323   int consumed, consumed_char, produced, produced_char;
5324
5325   from = 0;
5326   to = XSTRING (str)->size;
5327   to_byte = STRING_BYTES (XSTRING (str));
5328
5329   saved_coding_symbol = Qnil;
5330   coding->src_multibyte = STRING_MULTIBYTE (str);
5331   coding->dst_multibyte = 1;
5332   if (CODING_REQUIRE_DETECTION (coding))
5333     {
5334       /* See the comments in code_convert_region.  */
5335       if (coding->type == coding_type_undecided)
5336         {
5337           detect_coding (coding, XSTRING (str)->data, to_byte);
5338           if (coding->type == coding_type_undecided)
5339             coding->type = coding_type_emacs_mule;
5340         }
5341       if (coding->eol_type == CODING_EOL_UNDECIDED
5342           && coding->type != coding_type_ccl)
5343         {
5344           saved_coding_symbol = coding->symbol;
5345           detect_eol (coding, XSTRING (str)->data, to_byte);
5346           if (coding->eol_type == CODING_EOL_UNDECIDED)
5347             coding->eol_type = CODING_EOL_LF;
5348           /* We had better recover the original eol format if we
5349              encounter an inconsitent eol format while decoding.  */
5350           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5351         }
5352     }
5353
5354   if (coding->type == coding_type_no_conversion
5355       || coding->type == coding_type_raw_text)
5356     coding->dst_multibyte = 0;
5357
5358   require_decoding = CODING_REQUIRE_DECODING (coding);
5359
5360   if (STRING_MULTIBYTE (str))
5361     {
5362       /* Decoding routines expect the source text to be unibyte.  */
5363       str = Fstring_as_unibyte (str);
5364       to_byte = STRING_BYTES (XSTRING (str));
5365       nocopy = 1;
5366       coding->src_multibyte = 0;
5367     }
5368
5369   /* Try to skip the heading and tailing ASCIIs.  */
5370   if (require_decoding && coding->type != coding_type_ccl)
5371     {
5372       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5373                                 0);
5374       if (from == to_byte)
5375         require_decoding = 0;
5376       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5377     }
5378
5379   if (!require_decoding)
5380     {
5381       coding->consumed = STRING_BYTES (XSTRING (str));
5382       coding->consumed_char = XSTRING (str)->size;
5383       if (coding->dst_multibyte)
5384         {
5385           str = Fstring_as_multibyte (str);
5386           nocopy = 1;
5387         }
5388       coding->produced = STRING_BYTES (XSTRING (str));
5389       coding->produced_char = XSTRING (str)->size;
5390       return (nocopy ? str : Fcopy_sequence (str));
5391     }
5392
5393   if (coding->composing != COMPOSITION_DISABLED)
5394     coding_allocate_composition_data (coding, from);
5395   len = decoding_buffer_size (coding, to_byte - from);
5396   allocate_conversion_buffer (buf, len);
5397
5398   consumed = consumed_char = produced = produced_char = 0;
5399   while (1)
5400     {
5401       result = decode_coding (coding, XSTRING (str)->data + from + consumed,
5402                               buf.data + produced, to_byte - from - consumed,
5403                               buf.size - produced);
5404       consumed += coding->consumed;
5405       consumed_char += coding->consumed_char;
5406       produced += coding->produced;
5407       produced_char += coding->produced_char;
5408       if (result == CODING_FINISH_NORMAL
5409           || (result == CODING_FINISH_INSUFFICIENT_SRC
5410               && coding->consumed == 0))
5411         break;
5412       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5413         coding_allocate_composition_data (coding, from + produced_char);
5414       else if (result == CODING_FINISH_INSUFFICIENT_DST)
5415         extend_conversion_buffer (&buf);
5416       else if (result == CODING_FINISH_INCONSISTENT_EOL)
5417         {
5418           /* Recover the original EOL format.  */
5419           if (coding->eol_type == CODING_EOL_CR)
5420             {
5421               unsigned char *p;
5422               for (p = buf.data; p < buf.data + produced; p++)
5423                 if (*p == '\n') *p = '\r';
5424             }
5425           else if (coding->eol_type == CODING_EOL_CRLF)
5426             {
5427               int num_eol = 0;
5428               unsigned char *p0, *p1;
5429               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5430                 if (*p0 == '\n') num_eol++;
5431               if (produced + num_eol >= buf.size)
5432                 extend_conversion_buffer (&buf);
5433               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5434                 {
5435                   *--p1 = *--p0;
5436                   if (*p0 == '\n') *--p1 = '\r';
5437                 }
5438               produced += num_eol;
5439               produced_char += num_eol;
5440             }
5441           coding->eol_type = CODING_EOL_LF;
5442           coding->symbol = saved_coding_symbol;
5443         }
5444     }
5445
5446   coding->consumed = consumed;
5447   coding->consumed_char = consumed_char;
5448   coding->produced = produced;
5449   coding->produced_char = produced_char;
5450
5451   if (coding->dst_multibyte)
5452     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
5453                                            produced + shrinked_bytes);
5454   else
5455     newstr = make_uninit_string (produced + shrinked_bytes);
5456   if (from > 0)
5457     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5458   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5459   if (shrinked_bytes > from)
5460     bcopy (XSTRING (str)->data + to_byte,
5461            XSTRING (newstr)->data + from + produced,
5462            shrinked_bytes - from);
5463   free_conversion_buffer (&buf);
5464
5465   if (coding->cmp_data && coding->cmp_data->used)
5466     coding_restore_composition (coding, newstr);
5467   coding_free_composition_data (coding);
5468
5469   if (SYMBOLP (coding->post_read_conversion)
5470       && !NILP (Ffboundp (coding->post_read_conversion)))
5471     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
5472
5473   return newstr;
5474 }
5475
5476 Lisp_Object
5477 encode_coding_string (str, coding, nocopy)
5478      Lisp_Object str;
5479      struct coding_system *coding;
5480      int nocopy;
5481 {
5482   int len;
5483   struct conversion_buffer buf;
5484   int from, to, to_byte;
5485   struct gcpro gcpro1;
5486   Lisp_Object saved_coding_symbol;
5487   int result;
5488   int shrinked_bytes = 0;
5489   Lisp_Object newstr;
5490   int consumed, consumed_char, produced, produced_char;
5491
5492   if (SYMBOLP (coding->pre_write_conversion)
5493       && !NILP (Ffboundp (coding->pre_write_conversion)))
5494     str = run_pre_post_conversion_on_str (str, coding, 1);
5495
5496   from = 0;
5497   to = XSTRING (str)->size;
5498   to_byte = STRING_BYTES (XSTRING (str));
5499
5500   saved_coding_symbol = Qnil;
5501
5502   /* Encoding routines determine the multibyteness of the source text
5503      by coding->src_multibyte.  */
5504   coding->src_multibyte = STRING_MULTIBYTE (str);
5505   coding->dst_multibyte = 0;
5506   if (! CODING_REQUIRE_ENCODING (coding))
5507     {
5508       coding->consumed = STRING_BYTES (XSTRING (str));
5509       coding->consumed_char = XSTRING (str)->size;
5510       if (STRING_MULTIBYTE (str))
5511         {
5512           str = Fstring_as_unibyte (str);
5513           nocopy = 1;
5514         }
5515       coding->produced = STRING_BYTES (XSTRING (str));
5516       coding->produced_char = XSTRING (str)->size;
5517       return (nocopy ? str : Fcopy_sequence (str));
5518     }
5519
5520   if (coding->composing != COMPOSITION_DISABLED)
5521     coding_save_composition (coding, from, to, str);
5522
5523   /* Try to skip the heading and tailing ASCIIs.  */
5524   if (coding->type != coding_type_ccl)
5525     {
5526       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5527                                 1);
5528       if (from == to_byte)
5529         return (nocopy ? str : Fcopy_sequence (str));
5530       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5531     }
5532
5533   len = encoding_buffer_size (coding, to_byte - from);
5534   allocate_conversion_buffer (buf, len);
5535
5536   consumed = consumed_char = produced = produced_char = 0;
5537   while (1)
5538     {
5539       result = encode_coding (coding, XSTRING (str)->data + from + consumed,
5540                               buf.data + produced, to_byte - from - consumed,
5541                               buf.size - produced);
5542       consumed += coding->consumed;
5543       consumed_char += coding->consumed_char;
5544       produced += coding->produced;
5545       produced_char += coding->produced_char;
5546       if (result == CODING_FINISH_NORMAL
5547           || (result == CODING_FINISH_INSUFFICIENT_SRC
5548               && coding->consumed == 0))
5549         break;
5550       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
5551       extend_conversion_buffer (&buf);
5552     }
5553
5554   coding->consumed = consumed;
5555   coding->consumed_char = consumed_char;
5556   coding->produced = produced;
5557   coding->produced_char = produced_char;
5558
5559   newstr = make_uninit_string (produced + shrinked_bytes);
5560   if (from > 0)
5561     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5562   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5563   if (shrinked_bytes > from)
5564     bcopy (XSTRING (str)->data + to_byte,
5565            XSTRING (newstr)->data + from + produced,
5566            shrinked_bytes - from);
5567
5568   free_conversion_buffer (&buf);
5569   coding_free_composition_data (coding);
5570
5571   return newstr;
5572 }
5573
5574 \f
5575 #ifdef emacs
5576 /*** 8. Emacs Lisp library functions ***/
5577
5578 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5579   "Return t if OBJECT is nil or a coding-system.\n\
5580 See the documentation of `make-coding-system' for information\n\
5581 about coding-system objects.")
5582   (obj)
5583      Lisp_Object obj;
5584 {
5585   if (NILP (obj))
5586     return Qt;
5587   if (!SYMBOLP (obj))
5588     return Qnil;
5589   /* Get coding-spec vector for OBJ.  */
5590   obj = Fget (obj, Qcoding_system);
5591   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5592           ? Qt : Qnil);
5593 }
5594
5595 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5596        Sread_non_nil_coding_system, 1, 1, 0,
5597   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5598   (prompt)
5599      Lisp_Object prompt;
5600 {
5601   Lisp_Object val;
5602   do
5603     {
5604       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5605                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5606     }
5607   while (XSTRING (val)->size == 0);
5608   return (Fintern (val, Qnil));
5609 }
5610
5611 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5612   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5613 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5614   (prompt, default_coding_system)
5615      Lisp_Object prompt, default_coding_system;
5616 {
5617   Lisp_Object val;
5618   if (SYMBOLP (default_coding_system))
5619     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5620   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5621                           Qt, Qnil, Qcoding_system_history,
5622                           default_coding_system, Qnil);
5623   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5624 }
5625
5626 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5627        1, 1, 0,
5628   "Check validity of CODING-SYSTEM.\n\
5629 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5630 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5631 The value of property should be a vector of length 5.")
5632   (coding_system)
5633      Lisp_Object coding_system;
5634 {
5635   CHECK_SYMBOL (coding_system, 0);
5636   if (!NILP (Fcoding_system_p (coding_system)))
5637     return coding_system;
5638   while (1)
5639     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5640 }
5641 \f
5642 Lisp_Object
5643 detect_coding_system (src, src_bytes, highest, multibytep)
5644      unsigned char *src;
5645      int src_bytes, highest;
5646      int multibytep;
5647 {
5648   int coding_mask, eol_type;
5649   Lisp_Object val, tmp;
5650   int dummy;
5651
5652   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
5653   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5654   if (eol_type == CODING_EOL_INCONSISTENT)
5655     eol_type = CODING_EOL_UNDECIDED;
5656
5657   if (!coding_mask)
5658     {
5659       val = Qundecided;
5660       if (eol_type != CODING_EOL_UNDECIDED)
5661         {
5662           Lisp_Object val2;
5663           val2 = Fget (Qundecided, Qeol_type);
5664           if (VECTORP (val2))
5665             val = XVECTOR (val2)->contents[eol_type];
5666         }
5667       return (highest ? val : Fcons (val, Qnil));
5668     }
5669
5670   /* At first, gather possible coding systems in VAL.  */
5671   val = Qnil;
5672   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5673     {
5674       Lisp_Object category_val, category_index;
5675
5676       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5677       category_val = Fsymbol_value (XCAR (tmp));
5678       if (!NILP (category_val)
5679           && NATNUMP (category_index)
5680           && (coding_mask & (1 << XFASTINT (category_index))))
5681         {
5682           val = Fcons (category_val, val);
5683           if (highest)
5684             break;
5685         }
5686     }
5687   if (!highest)
5688     val = Fnreverse (val);
5689
5690   /* Then, replace the elements with subsidiary coding systems.  */
5691   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5692     {
5693       if (eol_type != CODING_EOL_UNDECIDED
5694           && eol_type != CODING_EOL_INCONSISTENT)
5695         {
5696           Lisp_Object eol;
5697           eol = Fget (XCAR (tmp), Qeol_type);
5698           if (VECTORP (eol))
5699             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5700         }
5701     }
5702   return (highest ? XCAR (val) : val);
5703 }
5704
5705 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5706        2, 3, 0,
5707   "Detect coding system of the text in the region between START and END.\n\
5708 Return a list of possible coding systems ordered by priority.\n\
5709 \n\
5710 If only ASCII characters are found, it returns a list of single element\n\
5711 `undecided' or its subsidiary coding system according to a detected\n\
5712 end-of-line format.\n\
5713 \n\
5714 If optional argument HIGHEST is non-nil, return the coding system of\n\
5715 highest priority.")
5716   (start, end, highest)
5717      Lisp_Object start, end, highest;
5718 {
5719   int from, to;
5720   int from_byte, to_byte;
5721
5722   CHECK_NUMBER_COERCE_MARKER (start, 0);
5723   CHECK_NUMBER_COERCE_MARKER (end, 1);
5724
5725   validate_region (&start, &end);
5726   from = XINT (start), to = XINT (end);
5727   from_byte = CHAR_TO_BYTE (from);
5728   to_byte = CHAR_TO_BYTE (to);
5729
5730   if (from < GPT && to >= GPT)
5731     move_gap_both (to, to_byte);
5732
5733   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5734                                to_byte - from_byte,
5735                                !NILP (highest),
5736                                !NILP (current_buffer
5737                                       ->enable_multibyte_characters));
5738 }
5739
5740 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5741        1, 2, 0,
5742   "Detect coding system of the text in STRING.\n\
5743 Return a list of possible coding systems ordered by priority.\n\
5744 \n\
5745 If only ASCII characters are found, it returns a list of single element\n\
5746 `undecided' or its subsidiary coding system according to a detected\n\
5747 end-of-line format.\n\
5748 \n\
5749 If optional argument HIGHEST is non-nil, return the coding system of\n\
5750 highest priority.")
5751   (string, highest)
5752      Lisp_Object string, highest;
5753 {
5754   CHECK_STRING (string, 0);
5755
5756   return detect_coding_system (XSTRING (string)->data,
5757                                STRING_BYTES (XSTRING (string)),
5758                                !NILP (highest),
5759                                STRING_MULTIBYTE (string));
5760 }
5761
5762 /* Return an intersection of lists L1 and L2.  */
5763
5764 static Lisp_Object
5765 intersection (l1, l2)
5766      Lisp_Object l1, l2;
5767 {
5768   Lisp_Object val;
5769
5770   for (val = Qnil; CONSP (l1); l1 = XCDR (l1))
5771     {
5772       if (!NILP (Fmemq (XCAR (l1), l2)))
5773         val = Fcons (XCAR (l1), val);
5774     }
5775   return val;
5776 }
5777
5778
5779 /*  Subroutine for Fsafe_coding_systems_region_internal.
5780
5781     Return a list of coding systems that safely encode the multibyte
5782     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
5783     possible coding systems.  If it is nil, it means that we have not
5784     yet found any coding systems.
5785
5786     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
5787     element of WORK_TABLE is set to t once the element is looked up.
5788
5789     If a non-ASCII single byte char is found, set
5790     *single_byte_char_found to 1.  */
5791
5792 static Lisp_Object
5793 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
5794      unsigned char *p, *pend;
5795      Lisp_Object safe_codings, work_table;
5796      int *single_byte_char_found;
5797 {
5798   int c, len, idx;
5799   Lisp_Object val;
5800
5801   while (p < pend)
5802     {
5803       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
5804       p += len;
5805       if (ASCII_BYTE_P (c))
5806         /* We can ignore ASCII characters here.  */
5807         continue;
5808       if (SINGLE_BYTE_CHAR_P (c))
5809         *single_byte_char_found = 1;
5810       if (NILP (safe_codings))
5811         continue;
5812       /* Check the safe coding systems for C.  */
5813       val = char_table_ref_and_index (work_table, c, &idx);
5814       if (EQ (val, Qt))
5815         /* This element was already checked.  Ignore it.  */
5816         continue;
5817       /* Remember that we checked this element.  */
5818       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
5819
5820       /* If there are some safe coding systems for C and we have
5821          already found the other set of coding systems for the
5822          different characters, get the intersection of them.  */
5823       if (!EQ (safe_codings, Qt) && !NILP (val))
5824         val = intersection (safe_codings, val);
5825       safe_codings = val;
5826     }
5827   return safe_codings;
5828 }
5829
5830
5831 /* Return a list of coding systems that safely encode the text between
5832    START and END.  If the text contains only ASCII or is unibyte,
5833    return t.  */
5834
5835 DEFUN ("find-coding-systems-region-internal",
5836        Ffind_coding_systems_region_internal,
5837        Sfind_coding_systems_region_internal, 2, 2, 0,
5838   "Internal use only.")
5839   (start, end)
5840      Lisp_Object start, end;
5841 {
5842   Lisp_Object work_table, safe_codings;
5843   int non_ascii_p = 0;
5844   int single_byte_char_found = 0;
5845   unsigned char *p1, *p1end, *p2, *p2end, *p;
5846   Lisp_Object args[2];
5847
5848   if (STRINGP (start))
5849     {
5850       if (!STRING_MULTIBYTE (start))
5851         return Qt;
5852       p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start));
5853       p2 = p2end = p1end;
5854       if (XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
5855         non_ascii_p = 1;
5856     }
5857   else
5858     {
5859       int from, to, stop;
5860
5861       CHECK_NUMBER_COERCE_MARKER (start, 0);
5862       CHECK_NUMBER_COERCE_MARKER (end, 1);
5863       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
5864         args_out_of_range (start, end);
5865       if (NILP (current_buffer->enable_multibyte_characters))
5866         return Qt;
5867       from = CHAR_TO_BYTE (XINT (start));
5868       to = CHAR_TO_BYTE (XINT (end));
5869       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
5870       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
5871       if (stop == to)
5872         p2 = p2end = p1end;
5873       else
5874         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
5875       if (XINT (end) - XINT (start) != to - from)
5876         non_ascii_p = 1;
5877     }
5878
5879   if (!non_ascii_p)
5880     {
5881       /* We are sure that the text contains no multibyte character.
5882          Check if it contains eight-bit-graphic.  */
5883       p = p1;
5884       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
5885       if (p == p1end)
5886         {
5887           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
5888           if (p == p2end)
5889             return Qt;
5890         }
5891     }
5892
5893   /* The text contains non-ASCII characters.  */
5894   work_table = Fcopy_sequence (Vchar_coding_system_table);
5895   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
5896                                     &single_byte_char_found);
5897   if (p2 < p2end)
5898     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
5899                                       &single_byte_char_found);
5900
5901   if (!single_byte_char_found)
5902     {
5903       /* Append generic coding systems.  */
5904       Lisp_Object args[2];
5905       args[0] = safe_codings;
5906       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
5907                                         make_number (0));
5908       safe_codings = Fappend (2, args);
5909     }
5910   else
5911     safe_codings = Fcons (Qraw_text,
5912                           Fcons (Qemacs_mule,
5913                                  Fcons (Qno_conversion, safe_codings)));
5914   return safe_codings;
5915 }
5916
5917
5918 Lisp_Object
5919 code_convert_region1 (start, end, coding_system, encodep)
5920      Lisp_Object start, end, coding_system;
5921      int encodep;
5922 {
5923   struct coding_system coding;
5924   int from, to, len;
5925
5926   CHECK_NUMBER_COERCE_MARKER (start, 0);
5927   CHECK_NUMBER_COERCE_MARKER (end, 1);
5928   CHECK_SYMBOL (coding_system, 2);
5929
5930   validate_region (&start, &end);
5931   from = XFASTINT (start);
5932   to = XFASTINT (end);
5933
5934   if (NILP (coding_system))
5935     return make_number (to - from);
5936
5937   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5938     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5939
5940   coding.mode |= CODING_MODE_LAST_BLOCK;
5941   coding.src_multibyte = coding.dst_multibyte
5942     = !NILP (current_buffer->enable_multibyte_characters);
5943   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5944                        &coding, encodep, 1);
5945   Vlast_coding_system_used = coding.symbol;
5946   return make_number (coding.produced_char);
5947 }
5948
5949 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5950        3, 3, "r\nzCoding system: ",
5951   "Decode the current region by specified coding system.\n\
5952 When called from a program, takes three arguments:\n\
5953 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5954 This function sets `last-coding-system-used' to the precise coding system\n\
5955 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5956 not fully specified.)\n\
5957 It returns the length of the decoded text.")
5958   (start, end, coding_system)
5959      Lisp_Object start, end, coding_system;
5960 {
5961   return code_convert_region1 (start, end, coding_system, 0);
5962 }
5963
5964 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5965        3, 3, "r\nzCoding system: ",
5966   "Encode the current region by specified coding system.\n\
5967 When called from a program, takes three arguments:\n\
5968 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5969 This function sets `last-coding-system-used' to the precise coding system\n\
5970 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5971 not fully specified.)\n\
5972 It returns the length of the encoded text.")
5973   (start, end, coding_system)
5974      Lisp_Object start, end, coding_system;
5975 {
5976   return code_convert_region1 (start, end, coding_system, 1);
5977 }
5978
5979 Lisp_Object
5980 code_convert_string1 (string, coding_system, nocopy, encodep)
5981      Lisp_Object string, coding_system, nocopy;
5982      int encodep;
5983 {
5984   struct coding_system coding;
5985
5986   CHECK_STRING (string, 0);
5987   CHECK_SYMBOL (coding_system, 1);
5988
5989   if (NILP (coding_system))
5990     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5991
5992   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5993     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5994
5995   coding.mode |= CODING_MODE_LAST_BLOCK;
5996   string = (encodep
5997             ? encode_coding_string (string, &coding, !NILP (nocopy))
5998             : decode_coding_string (string, &coding, !NILP (nocopy)));
5999   Vlast_coding_system_used = coding.symbol;
6000
6001   return string;
6002 }
6003
6004 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6005        2, 3, 0,
6006   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
6007 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
6008 if the decoding operation is trivial.\n\
6009 This function sets `last-coding-system-used' to the precise coding system\n\
6010 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
6011 not fully specified.)")
6012   (string, coding_system, nocopy)
6013      Lisp_Object string, coding_system, nocopy;
6014 {
6015   return code_convert_string1 (string, coding_system, nocopy, 0);
6016 }
6017
6018 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6019        2, 3, 0,
6020   "Encode STRING to CODING-SYSTEM, and return the result.\n\
6021 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
6022 if the encoding operation is trivial.\n\
6023 This function sets `last-coding-system-used' to the precise coding system\n\
6024 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
6025 not fully specified.)")
6026   (string, coding_system, nocopy)
6027      Lisp_Object string, coding_system, nocopy;
6028 {
6029   return code_convert_string1 (string, coding_system, nocopy, 1);
6030 }
6031
6032 /* Encode or decode STRING according to CODING_SYSTEM.
6033    Do not set Vlast_coding_system_used.
6034
6035    This function is called only from macros DECODE_FILE and
6036    ENCODE_FILE, thus we ignore character composition.  */
6037
6038 Lisp_Object
6039 code_convert_string_norecord (string, coding_system, encodep)
6040      Lisp_Object string, coding_system;
6041      int encodep;
6042 {
6043   struct coding_system coding;
6044
6045   CHECK_STRING (string, 0);
6046   CHECK_SYMBOL (coding_system, 1);
6047
6048   if (NILP (coding_system))
6049     return string;
6050
6051   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6052     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
6053
6054   coding.composing = COMPOSITION_DISABLED;
6055   coding.mode |= CODING_MODE_LAST_BLOCK;
6056   return (encodep
6057           ? encode_coding_string (string, &coding, 1)
6058           : decode_coding_string (string, &coding, 1));
6059 }
6060 \f
6061 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6062   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
6063 Return the corresponding character.")
6064   (code)
6065      Lisp_Object code;
6066 {
6067   unsigned char c1, c2, s1, s2;
6068   Lisp_Object val;
6069
6070   CHECK_NUMBER (code, 0);
6071   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6072   if (s1 == 0)
6073     {
6074       if (s2 < 0x80)
6075         XSETFASTINT (val, s2);
6076       else if (s2 >= 0xA0 || s2 <= 0xDF)
6077         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6078       else
6079         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6080     }
6081   else
6082     {
6083       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
6084           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6085         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6086       DECODE_SJIS (s1, s2, c1, c2);
6087       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6088     }
6089   return val;
6090 }
6091
6092 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6093   "Encode a Japanese character CHAR to shift_jis encoding.\n\
6094 Return the corresponding code in SJIS.")
6095   (ch)
6096      Lisp_Object ch;
6097 {
6098   int charset, c1, c2, s1, s2;
6099   Lisp_Object val;
6100
6101   CHECK_NUMBER (ch, 0);
6102   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6103   if (charset == CHARSET_ASCII)
6104     {
6105       val = ch;
6106     }
6107   else if (charset == charset_jisx0208
6108            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6109     {
6110       ENCODE_SJIS (c1, c2, s1, s2);
6111       XSETFASTINT (val, (s1 << 8) | s2);
6112     }
6113   else if (charset == charset_katakana_jisx0201
6114            && c1 > 0x20 && c2 < 0xE0)
6115     {
6116       XSETFASTINT (val, c1 | 0x80);
6117     }
6118   else
6119     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6120   return val;
6121 }
6122
6123 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6124   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
6125 Return the corresponding character.")
6126   (code)
6127      Lisp_Object code;
6128 {
6129   int charset;
6130   unsigned char b1, b2, c1, c2;
6131   Lisp_Object val;
6132
6133   CHECK_NUMBER (code, 0);
6134   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6135   if (b1 == 0)
6136     {
6137       if (b2 >= 0x80)
6138         error ("Invalid BIG5 code: %x", XFASTINT (code));
6139       val = code;
6140     }
6141   else
6142     {
6143       if ((b1 < 0xA1 || b1 > 0xFE)
6144           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6145         error ("Invalid BIG5 code: %x", XFASTINT (code));
6146       DECODE_BIG5 (b1, b2, charset, c1, c2);
6147       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6148     }
6149   return val;
6150 }
6151
6152 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6153   "Encode the Big5 character CHAR to BIG5 coding system.\n\
6154 Return the corresponding character code in Big5.")
6155   (ch)
6156      Lisp_Object ch;
6157 {
6158   int charset, c1, c2, b1, b2;
6159   Lisp_Object val;
6160
6161   CHECK_NUMBER (ch, 0);
6162   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6163   if (charset == CHARSET_ASCII)
6164     {
6165       val = ch;
6166     }
6167   else if ((charset == charset_big5_1
6168             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6169            || (charset == charset_big5_2
6170                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6171     {
6172       ENCODE_BIG5 (charset, c1, c2, b1, b2);
6173       XSETFASTINT (val, (b1 << 8) | b2);
6174     }
6175   else
6176     error ("Can't encode to Big5: %d", XFASTINT (ch));
6177   return val;
6178 }
6179 \f
6180 DEFUN ("set-terminal-coding-system-internal",
6181        Fset_terminal_coding_system_internal,
6182        Sset_terminal_coding_system_internal, 1, 1, 0, "")
6183   (coding_system)
6184      Lisp_Object coding_system;
6185 {
6186   CHECK_SYMBOL (coding_system, 0);
6187   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6188   /* We had better not send unsafe characters to terminal.  */
6189   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6190   /* Characer composition should be disabled.  */
6191   terminal_coding.composing = COMPOSITION_DISABLED;
6192   terminal_coding.src_multibyte = 1;
6193   terminal_coding.dst_multibyte = 0;
6194   return Qnil;
6195 }
6196
6197 DEFUN ("set-safe-terminal-coding-system-internal",
6198        Fset_safe_terminal_coding_system_internal,
6199        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
6200   (coding_system)
6201      Lisp_Object coding_system;
6202 {
6203   CHECK_SYMBOL (coding_system, 0);
6204   setup_coding_system (Fcheck_coding_system (coding_system),
6205                        &safe_terminal_coding);
6206   /* Characer composition should be disabled.  */
6207   safe_terminal_coding.composing = COMPOSITION_DISABLED;
6208   safe_terminal_coding.src_multibyte = 1;
6209   safe_terminal_coding.dst_multibyte = 0;
6210   return Qnil;
6211 }
6212
6213 DEFUN ("terminal-coding-system",
6214        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6215   "Return coding system specified for terminal output.")
6216   ()
6217 {
6218   return terminal_coding.symbol;
6219 }
6220
6221 DEFUN ("set-keyboard-coding-system-internal",
6222        Fset_keyboard_coding_system_internal,
6223        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
6224   (coding_system)
6225      Lisp_Object coding_system;
6226 {
6227   CHECK_SYMBOL (coding_system, 0);
6228   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6229   /* Characer composition should be disabled.  */
6230   keyboard_coding.composing = COMPOSITION_DISABLED;
6231   return Qnil;
6232 }
6233
6234 DEFUN ("keyboard-coding-system",
6235        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6236   "Return coding system specified for decoding keyboard input.")
6237   ()
6238 {
6239   return keyboard_coding.symbol;
6240 }
6241
6242 \f
6243 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6244        Sfind_operation_coding_system,  1, MANY, 0,
6245   "Choose a coding system for an operation based on the target name.\n\
6246 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
6247 DECODING-SYSTEM is the coding system to use for decoding\n\
6248 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
6249 for encoding (in case OPERATION does encoding).\n\
6250 \n\
6251 The first argument OPERATION specifies an I/O primitive:\n\
6252   For file I/O, `insert-file-contents' or `write-region'.\n\
6253   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
6254   For network I/O, `open-network-stream'.\n\
6255 \n\
6256 The remaining arguments should be the same arguments that were passed\n\
6257 to the primitive.  Depending on which primitive, one of those arguments\n\
6258 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
6259 whichever argument specifies the file name is TARGET.\n\
6260 \n\
6261 TARGET has a meaning which depends on OPERATION:\n\
6262   For file I/O, TARGET is a file name.\n\
6263   For process I/O, TARGET is a process name.\n\
6264   For network I/O, TARGET is a service name or a port number\n\
6265 \n\
6266 This function looks up what specified for TARGET in,\n\
6267 `file-coding-system-alist', `process-coding-system-alist',\n\
6268 or `network-coding-system-alist' depending on OPERATION.\n\
6269 They may specify a coding system, a cons of coding systems,\n\
6270 or a function symbol to call.\n\
6271 In the last case, we call the function with one argument,\n\
6272 which is a list of all the arguments given to this function.")
6273   (nargs, args)
6274      int nargs;
6275      Lisp_Object *args;
6276 {
6277   Lisp_Object operation, target_idx, target, val;
6278   register Lisp_Object chain;
6279
6280   if (nargs < 2)
6281     error ("Too few arguments");
6282   operation = args[0];
6283   if (!SYMBOLP (operation)
6284       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
6285     error ("Invalid first arguement");
6286   if (nargs < 1 + XINT (target_idx))
6287     error ("Too few arguments for operation: %s",
6288            XSYMBOL (operation)->name->data);
6289   target = args[XINT (target_idx) + 1];
6290   if (!(STRINGP (target)
6291         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
6292     error ("Invalid %dth argument", XINT (target_idx) + 1);
6293
6294   chain = ((EQ (operation, Qinsert_file_contents)
6295             || EQ (operation, Qwrite_region))
6296            ? Vfile_coding_system_alist
6297            : (EQ (operation, Qopen_network_stream)
6298               ? Vnetwork_coding_system_alist
6299               : Vprocess_coding_system_alist));
6300   if (NILP (chain))
6301     return Qnil;
6302
6303   for (; CONSP (chain); chain = XCDR (chain))
6304     {
6305       Lisp_Object elt;
6306       elt = XCAR (chain);
6307
6308       if (CONSP (elt)
6309           && ((STRINGP (target)
6310                && STRINGP (XCAR (elt))
6311                && fast_string_match (XCAR (elt), target) >= 0)
6312               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6313         {
6314           val = XCDR (elt);
6315           /* Here, if VAL is both a valid coding system and a valid
6316              function symbol, we return VAL as a coding system.  */
6317           if (CONSP (val))
6318             return val;
6319           if (! SYMBOLP (val))
6320             return Qnil;
6321           if (! NILP (Fcoding_system_p (val)))
6322             return Fcons (val, val);
6323           if (! NILP (Ffboundp (val)))
6324             {
6325               val = call1 (val, Flist (nargs, args));
6326               if (CONSP (val))
6327                 return val;
6328               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
6329                 return Fcons (val, val);
6330             }
6331           return Qnil;
6332         }
6333     }
6334   return Qnil;
6335 }
6336
6337 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
6338        Supdate_coding_systems_internal, 0, 0, 0,
6339   "Update internal database for ISO2022 and CCL based coding systems.\n\
6340 When values of any coding categories are changed, you must\n\
6341 call this function")
6342   ()
6343 {
6344   int i;
6345
6346   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
6347     {
6348       Lisp_Object val;
6349
6350       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
6351       if (!NILP (val))
6352         {
6353           if (! coding_system_table[i])
6354             coding_system_table[i] = ((struct coding_system *)
6355                                       xmalloc (sizeof (struct coding_system)));
6356           setup_coding_system (val, coding_system_table[i]);
6357         }
6358       else if (coding_system_table[i])
6359         {
6360           xfree (coding_system_table[i]);
6361           coding_system_table[i] = NULL;
6362         }
6363     }
6364
6365   return Qnil;
6366 }
6367
6368 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
6369        Sset_coding_priority_internal, 0, 0, 0,
6370   "Update internal database for the current value of `coding-category-list'.\n\
6371 This function is internal use only.")
6372   ()
6373 {
6374   int i = 0, idx;
6375   Lisp_Object val;
6376
6377   val = Vcoding_category_list;
6378
6379   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
6380     {
6381       if (! SYMBOLP (XCAR (val)))
6382         break;
6383       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6384       if (idx >= CODING_CATEGORY_IDX_MAX)
6385         break;
6386       coding_priorities[i++] = (1 << idx);
6387       val = XCDR (val);
6388     }
6389   /* If coding-category-list is valid and contains all coding
6390      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
6391      the following code saves Emacs from crashing.  */
6392   while (i < CODING_CATEGORY_IDX_MAX)
6393     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6394
6395   return Qnil;
6396 }
6397
6398 #endif /* emacs */
6399
6400 \f
6401 /*** 9. Post-amble ***/
6402
6403 void
6404 init_coding_once ()
6405 {
6406   int i;
6407
6408   /* Emacs' internal format specific initialize routine.  */
6409   for (i = 0; i <= 0x20; i++)
6410     emacs_code_class[i] = EMACS_control_code;
6411   emacs_code_class[0x0A] = EMACS_linefeed_code;
6412   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6413   for (i = 0x21 ; i < 0x7F; i++)
6414     emacs_code_class[i] = EMACS_ascii_code;
6415   emacs_code_class[0x7F] = EMACS_control_code;
6416   for (i = 0x80; i < 0xFF; i++)
6417     emacs_code_class[i] = EMACS_invalid_code;
6418   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6419   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6420   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6421   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6422
6423   /* ISO2022 specific initialize routine.  */
6424   for (i = 0; i < 0x20; i++)
6425     iso_code_class[i] = ISO_control_0;
6426   for (i = 0x21; i < 0x7F; i++)
6427     iso_code_class[i] = ISO_graphic_plane_0;
6428   for (i = 0x80; i < 0xA0; i++)
6429     iso_code_class[i] = ISO_control_1;
6430   for (i = 0xA1; i < 0xFF; i++)
6431     iso_code_class[i] = ISO_graphic_plane_1;
6432   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6433   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6434   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6435   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6436   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6437   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6438   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6439   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6440   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6441   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6442
6443   setup_coding_system (Qnil, &keyboard_coding);
6444   setup_coding_system (Qnil, &terminal_coding);
6445   setup_coding_system (Qnil, &safe_terminal_coding);
6446   setup_coding_system (Qnil, &default_buffer_file_coding);
6447
6448   bzero (coding_system_table, sizeof coding_system_table);
6449
6450   bzero (ascii_skip_code, sizeof ascii_skip_code);
6451   for (i = 0; i < 128; i++)
6452     ascii_skip_code[i] = 1;
6453
6454 #if defined (MSDOS) || defined (WINDOWSNT)
6455   system_eol_type = CODING_EOL_CRLF;
6456 #else
6457   system_eol_type = CODING_EOL_LF;
6458 #endif
6459
6460   inhibit_pre_post_conversion = 0;
6461 }
6462
6463 #ifdef emacs
6464
6465 void
6466 syms_of_coding ()
6467 {
6468   Qtarget_idx = intern ("target-idx");
6469   staticpro (&Qtarget_idx);
6470
6471   Qcoding_system_history = intern ("coding-system-history");
6472   staticpro (&Qcoding_system_history);
6473   Fset (Qcoding_system_history, Qnil);
6474
6475   /* Target FILENAME is the first argument.  */
6476   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6477   /* Target FILENAME is the third argument.  */
6478   Fput (Qwrite_region, Qtarget_idx, make_number (2));
6479
6480   Qcall_process = intern ("call-process");
6481   staticpro (&Qcall_process);
6482   /* Target PROGRAM is the first argument.  */
6483   Fput (Qcall_process, Qtarget_idx, make_number (0));
6484
6485   Qcall_process_region = intern ("call-process-region");
6486   staticpro (&Qcall_process_region);
6487   /* Target PROGRAM is the third argument.  */
6488   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6489
6490   Qstart_process = intern ("start-process");
6491   staticpro (&Qstart_process);
6492   /* Target PROGRAM is the third argument.  */
6493   Fput (Qstart_process, Qtarget_idx, make_number (2));
6494
6495   Qopen_network_stream = intern ("open-network-stream");
6496   staticpro (&Qopen_network_stream);
6497   /* Target SERVICE is the fourth argument.  */
6498   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6499
6500   Qcoding_system = intern ("coding-system");
6501   staticpro (&Qcoding_system);
6502
6503   Qeol_type = intern ("eol-type");
6504   staticpro (&Qeol_type);
6505
6506   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6507   staticpro (&Qbuffer_file_coding_system);
6508
6509   Qpost_read_conversion = intern ("post-read-conversion");
6510   staticpro (&Qpost_read_conversion);
6511
6512   Qpre_write_conversion = intern ("pre-write-conversion");
6513   staticpro (&Qpre_write_conversion);
6514
6515   Qno_conversion = intern ("no-conversion");
6516   staticpro (&Qno_conversion);
6517
6518   Qundecided = intern ("undecided");
6519   staticpro (&Qundecided);
6520
6521   Qcoding_system_p = intern ("coding-system-p");
6522   staticpro (&Qcoding_system_p);
6523
6524   Qcoding_system_error = intern ("coding-system-error");
6525   staticpro (&Qcoding_system_error);
6526
6527   Fput (Qcoding_system_error, Qerror_conditions,
6528         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6529   Fput (Qcoding_system_error, Qerror_message,
6530         build_string ("Invalid coding system"));
6531
6532   Qcoding_category = intern ("coding-category");
6533   staticpro (&Qcoding_category);
6534   Qcoding_category_index = intern ("coding-category-index");
6535   staticpro (&Qcoding_category_index);
6536
6537   Vcoding_category_table
6538     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6539   staticpro (&Vcoding_category_table);
6540   {
6541     int i;
6542     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6543       {
6544         XVECTOR (Vcoding_category_table)->contents[i]
6545           = intern (coding_category_name[i]);
6546         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6547               Qcoding_category_index, make_number (i));
6548       }
6549   }
6550
6551   Qtranslation_table = intern ("translation-table");
6552   staticpro (&Qtranslation_table);
6553   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6554
6555   Qtranslation_table_id = intern ("translation-table-id");
6556   staticpro (&Qtranslation_table_id);
6557
6558   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6559   staticpro (&Qtranslation_table_for_decode);
6560
6561   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6562   staticpro (&Qtranslation_table_for_encode);
6563
6564   Qsafe_chars = intern ("safe-chars");
6565   staticpro (&Qsafe_chars);
6566
6567   Qchar_coding_system = intern ("char-coding-system");
6568   staticpro (&Qchar_coding_system);
6569
6570   /* Intern this now in case it isn't already done.
6571      Setting this variable twice is harmless.
6572      But don't staticpro it here--that is done in alloc.c.  */
6573   Qchar_table_extra_slots = intern ("char-table-extra-slots");
6574   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
6575   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (1));
6576
6577   Qvalid_codes = intern ("valid-codes");
6578   staticpro (&Qvalid_codes);
6579
6580   Qemacs_mule = intern ("emacs-mule");
6581   staticpro (&Qemacs_mule);
6582
6583   Qraw_text = intern ("raw-text");
6584   staticpro (&Qraw_text);
6585
6586   defsubr (&Scoding_system_p);
6587   defsubr (&Sread_coding_system);
6588   defsubr (&Sread_non_nil_coding_system);
6589   defsubr (&Scheck_coding_system);
6590   defsubr (&Sdetect_coding_region);
6591   defsubr (&Sdetect_coding_string);
6592   defsubr (&Sfind_coding_systems_region_internal);
6593   defsubr (&Sdecode_coding_region);
6594   defsubr (&Sencode_coding_region);
6595   defsubr (&Sdecode_coding_string);
6596   defsubr (&Sencode_coding_string);
6597   defsubr (&Sdecode_sjis_char);
6598   defsubr (&Sencode_sjis_char);
6599   defsubr (&Sdecode_big5_char);
6600   defsubr (&Sencode_big5_char);
6601   defsubr (&Sset_terminal_coding_system_internal);
6602   defsubr (&Sset_safe_terminal_coding_system_internal);
6603   defsubr (&Sterminal_coding_system);
6604   defsubr (&Sset_keyboard_coding_system_internal);
6605   defsubr (&Skeyboard_coding_system);
6606   defsubr (&Sfind_operation_coding_system);
6607   defsubr (&Supdate_coding_systems_internal);
6608   defsubr (&Sset_coding_priority_internal);
6609
6610   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6611     "List of coding systems.\n\
6612 \n\
6613 Do not alter the value of this variable manually.  This variable should be\n\
6614 updated by the functions `make-coding-system' and\n\
6615 `define-coding-system-alias'.");
6616   Vcoding_system_list = Qnil;
6617
6618   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6619     "Alist of coding system names.\n\
6620 Each element is one element list of coding system name.\n\
6621 This variable is given to `completing-read' as TABLE argument.\n\
6622 \n\
6623 Do not alter the value of this variable manually.  This variable should be\n\
6624 updated by the functions `make-coding-system' and\n\
6625 `define-coding-system-alias'.");
6626   Vcoding_system_alist = Qnil;
6627
6628   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6629     "List of coding-categories (symbols) ordered by priority.");
6630   {
6631     int i;
6632
6633     Vcoding_category_list = Qnil;
6634     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6635       Vcoding_category_list
6636         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6637                  Vcoding_category_list);
6638   }
6639
6640   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6641     "Specify the coding system for read operations.\n\
6642 It is useful to bind this variable with `let', but do not set it globally.\n\
6643 If the value is a coding system, it is used for decoding on read operation.\n\
6644 If not, an appropriate element is used from one of the coding system alists:\n\
6645 There are three such tables, `file-coding-system-alist',\n\
6646 `process-coding-system-alist', and `network-coding-system-alist'.");
6647   Vcoding_system_for_read = Qnil;
6648
6649   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6650     "Specify the coding system for write operations.\n\
6651 Programs bind this variable with `let', but you should not set it globally.\n\
6652 If the value is a coding system, it is used for encoding of output,\n\
6653 when writing it to a file and when sending it to a file or subprocess.\n\
6654 \n\
6655 If this does not specify a coding system, an appropriate element\n\
6656 is used from one of the coding system alists:\n\
6657 There are three such tables, `file-coding-system-alist',\n\
6658 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6659 For output to files, if the above procedure does not specify a coding system,\n\
6660 the value of `buffer-file-coding-system' is used.");
6661   Vcoding_system_for_write = Qnil;
6662
6663   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6664     "Coding system used in the latest file or process I/O.");
6665   Vlast_coding_system_used = Qnil;
6666
6667   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6668     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6669 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6670 such conversion.");
6671   inhibit_eol_conversion = 0;
6672
6673   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6674     "Non-nil means process buffer inherits coding system of process output.\n\
6675 Bind it to t if the process output is to be treated as if it were a file\n\
6676 read from some filesystem.");
6677   inherit_process_coding_system = 0;
6678
6679   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6680     "Alist to decide a coding system to use for a file I/O operation.\n\
6681 The format is ((PATTERN . VAL) ...),\n\
6682 where PATTERN is a regular expression matching a file name,\n\
6683 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6684 If VAL is a coding system, it is used for both decoding and encoding\n\
6685 the file contents.\n\
6686 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6687 and the cdr part is used for encoding.\n\
6688 If VAL is a function symbol, the function must return a coding system\n\
6689 or a cons of coding systems which are used as above.\n\
6690 \n\
6691 See also the function `find-operation-coding-system'\n\
6692 and the variable `auto-coding-alist'.");
6693   Vfile_coding_system_alist = Qnil;
6694
6695   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6696     "Alist to decide a coding system to use for a process I/O operation.\n\
6697 The format is ((PATTERN . VAL) ...),\n\
6698 where PATTERN is a regular expression matching a program name,\n\
6699 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6700 If VAL is a coding system, it is used for both decoding what received\n\
6701 from the program and encoding what sent to the program.\n\
6702 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6703 and the cdr part is used for encoding.\n\
6704 If VAL is a function symbol, the function must return a coding system\n\
6705 or a cons of coding systems which are used as above.\n\
6706 \n\
6707 See also the function `find-operation-coding-system'.");
6708   Vprocess_coding_system_alist = Qnil;
6709
6710   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6711     "Alist to decide a coding system to use for a network I/O operation.\n\
6712 The format is ((PATTERN . VAL) ...),\n\
6713 where PATTERN is a regular expression matching a network service name\n\
6714 or is a port number to connect to,\n\
6715 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6716 If VAL is a coding system, it is used for both decoding what received\n\
6717 from the network stream and encoding what sent to the network stream.\n\
6718 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6719 and the cdr part is used for encoding.\n\
6720 If VAL is a function symbol, the function must return a coding system\n\
6721 or a cons of coding systems which are used as above.\n\
6722 \n\
6723 See also the function `find-operation-coding-system'.");
6724   Vnetwork_coding_system_alist = Qnil;
6725
6726   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6727     "Coding system to use with system messages.");
6728   Vlocale_coding_system = Qnil;
6729
6730   /* The eol mnemonics are reset in startup.el system-dependently.  */
6731   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6732     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6733   eol_mnemonic_unix = build_string (":");
6734
6735   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6736     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6737   eol_mnemonic_dos = build_string ("\\");
6738
6739   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6740     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6741   eol_mnemonic_mac = build_string ("/");
6742
6743   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6744     "*String displayed in mode line when end-of-line format is not yet determined.");
6745   eol_mnemonic_undecided = build_string (":");
6746
6747   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6748     "*Non-nil enables character translation while encoding and decoding.");
6749   Venable_character_translation = Qt;
6750
6751   DEFVAR_LISP ("standard-translation-table-for-decode",
6752     &Vstandard_translation_table_for_decode,
6753     "Table for translating characters while decoding.");
6754   Vstandard_translation_table_for_decode = Qnil;
6755
6756   DEFVAR_LISP ("standard-translation-table-for-encode",
6757     &Vstandard_translation_table_for_encode,
6758     "Table for translationg characters while encoding.");
6759   Vstandard_translation_table_for_encode = Qnil;
6760
6761   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6762     "Alist of charsets vs revision numbers.\n\
6763 While encoding, if a charset (car part of an element) is found,\n\
6764 designate it with the escape sequence identifing revision (cdr part of the element).");
6765   Vcharset_revision_alist = Qnil;
6766
6767   DEFVAR_LISP ("default-process-coding-system",
6768                &Vdefault_process_coding_system,
6769     "Cons of coding systems used for process I/O by default.\n\
6770 The car part is used for decoding a process output,\n\
6771 the cdr part is used for encoding a text to be sent to a process.");
6772   Vdefault_process_coding_system = Qnil;
6773
6774   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6775     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6776 This is a vector of length 256.\n\
6777 If Nth element is non-nil, the existence of code N in a file\n\
6778 \(or output of subprocess) doesn't prevent it to be detected as\n\
6779 a coding system of ISO 2022 variant which has a flag\n\
6780 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6781 or reading output of a subprocess.\n\
6782 Only 128th through 159th elements has a meaning.");
6783   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6784
6785   DEFVAR_LISP ("select-safe-coding-system-function",
6786                &Vselect_safe_coding_system_function,
6787     "Function to call to select safe coding system for encoding a text.\n\
6788 \n\
6789 If set, this function is called to force a user to select a proper\n\
6790 coding system which can encode the text in the case that a default\n\
6791 coding system used in each operation can't encode the text.\n\
6792 \n\
6793 The default value is `select-safe-coding-system' (which see).");
6794   Vselect_safe_coding_system_function = Qnil;
6795
6796   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
6797     "Char-table containing safe coding systems of each characters.\n\
6798 Each element doesn't include such generic coding systems that can\n\
6799 encode any characters.   They are in the first extra slot.");
6800   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
6801
6802   DEFVAR_BOOL ("inhibit-iso-escape-detection",
6803                &inhibit_iso_escape_detection,
6804     "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\
6805 \n\
6806 By default, on reading a file, Emacs tries to detect how the text is\n\
6807 encoded.  This code detection is sensitive to escape sequences.  If\n\
6808 the sequence is valid as ISO2022, the code is determined as one of\n\
6809 the ISO2022 encodings, and the file is decoded by the corresponding\n\
6810 coding system (e.g. `iso-2022-7bit').\n\
6811 \n\
6812 However, there may be a case that you want to read escape sequences in\n\
6813 a file as is.  In such a case, you can set this variable to non-nil.\n\
6814 Then, as the code detection ignores any escape sequences, no file is\n\
6815 detected as encoded in some ISO2022 encoding.  The result is that all\n\
6816 escape sequences become visible in a buffer.\n\
6817 \n\
6818 The default value is nil, and it is strongly recommended not to change\n\
6819 it.  That is because many Emacs Lisp source files that contain\n\
6820 non-ASCII characters are encoded by the coding system `iso-2022-7bit'\n\
6821 in Emacs's distribution, and they won't be decoded correctly on\n\
6822 reading if you suppress escape sequence detection.\n\
6823 \n\
6824 The other way to read escape sequences in a file without decoding is\n\
6825 to explicitly specify some coding system that doesn't use ISO2022's\n\
6826 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].");
6827   inhibit_iso_escape_detection = 0;
6828 }
6829
6830 char *
6831 emacs_strerror (error_number)
6832      int error_number;
6833 {
6834   char *str;
6835
6836   synchronize_system_messages_locale ();
6837   str = strerror (error_number);
6838
6839   if (! NILP (Vlocale_coding_system))
6840     {
6841       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6842                                                       Vlocale_coding_system,
6843                                                       0);
6844       str = (char *) XSTRING (dec)->data;
6845     }
6846
6847   return str;
6848 }
6849
6850 #endif /* emacs */
6851