src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   A coding system is a mechanism for encoding one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting text in some other coding system
  45   to Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting text in the coding system emacs-mule to some
  47   other coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 7 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
 113   which appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 116   of the range 0x80..0x9F are in multibyte form.  */
 117 #if 0
 118 int
 119 detect_coding_emacs_mule (src, src_end, multibytep)
 120      unsigned char *src, *src_end;
 121      int multibytep;
 122 {
 123   ...
 124 }
 125 #endif
 126
 127 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 128
 129   These functions decode SRC_BYTES length of unibyte text at SOURCE
 130   encoded in CODING to Emacs' internal format.  The resulting
 131   multibyte text goes to a place pointed to by DESTINATION, the length
 132   of which should not exceed DST_BYTES.
 133
 134   These functions set the information of original and decoded texts in
 135   the members produced, produced_char, consumed, and consumed_char of
 136   the structure *CODING.  They also set the member result to one of
 137   CODING_FINISH_XXX indicating how the decoding finished.
 138
 139   DST_BYTES zero means that source area and destination area are
 140   overlapped, which means that we can produce a decoded text until it
 141   reaches the head of the not-yet-decoded source text.
 142
 143   Below is a template of these functions.  */
 144 #if 0
 145 static void
 146 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 147      struct coding_system *coding;
 148      unsigned char *source, *destination;
 149      int src_bytes, dst_bytes;
 150 {
 151   ...
 152 }
 153 #endif
 154
 155 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 156
 157   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 158   internal multibyte format to CODING.  The resulting unibyte text
 159   goes to a place pointed to by DESTINATION, the length of which
 160   should not exceed DST_BYTES.
 161
 162   These functions set the information of original and encoded texts in
 163   the members produced, produced_char, consumed, and consumed_char of
 164   the structure *CODING.  They also set the member result to one of
 165   CODING_FINISH_XXX indicating how the encoding finished.
 166
 167   DST_BYTES zero means that source area and destination area are
 168   overlapped, which means that we can produce an encoded text until it
 169   reaches the head of the not-yet-encoded source text.
 170
 171   Below is a template of these functions.  */
 172 #if 0
 173 static void
 174 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 175      struct coding_system *coding;
 176      unsigned char *source, *destination;
 177      int src_bytes, dst_bytes;
 178 {
 179   ...
 180 }
 181 #endif
 182
 183 /*** COMMONLY USED MACROS ***/
 184
 185 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 186    get one and two bytes from the source text respectively.
 187    If there are not enough bytes in the source, they jump to
 188    `label_end_of_loop'.  The caller should set variables `coding',
 189    `src' and `src_end' to appropriate pointer in advance.  These
 190    macros are called from decoding routines `decode_coding_XXX', thus
 191    it is assumed that the source text is unibyte.  */
 192
 193 #define ONE_MORE_BYTE(c1)                                       \
 194   do {                                                          \
 195     if (src >= src_end)                                         \
 196       {                                                         \
 197         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 198         goto label_end_of_loop;                                 \
 199       }                                                         \
 200     c1 = *src++;                                                \
 201   } while (0)
 202
 203 #define TWO_MORE_BYTES(c1, c2)                                  \
 204   do {                                                          \
 205     if (src + 1 >= src_end)                                     \
 206       {                                                         \
 207         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 208         goto label_end_of_loop;                                 \
 209       }                                                         \
 210     c1 = *src++;                                                \
 211     c2 = *src++;                                                \
 212   } while (0)
 213
 214
 215 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 216    form if MULTIBYTEP is nonzero.  */
 217
 218 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 219   do {                                                          \
 220     if (src >= src_end)                                         \
 221       {                                                         \
 222         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 223         goto label_end_of_loop;                                 \
 224       }                                                         \
 225     c1 = *src++;                                                \
 226     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 227       c1 = *src++ - 0x20;                                       \
 228   } while (0)
 229
 230 /* Set C to the next character at the source text pointed by `src'.
 231    If there are not enough characters in the source, jump to
 232    `label_end_of_loop'.  The caller should set variables `coding',
 233    `src', `src_end', and `translation_table' to appropriate pointers
 234    in advance.  This macro is used in encoding routines
 235    `encode_coding_XXX', thus it assumes that the source text is in
 236    multibyte form except for 8-bit characters.  8-bit characters are
 237    in multibyte form if coding->src_multibyte is nonzero, else they
 238    are represented by a single byte.  */
 239
 240 #define ONE_MORE_CHAR(c)                                        \
 241   do {                                                          \
 242     int len = src_end - src;                                    \
 243     int bytes;                                                  \
 244     if (len <= 0)                                               \
 245       {                                                         \
 246         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 247         goto label_end_of_loop;                                 \
 248       }                                                         \
 249     if (coding->src_multibyte                                   \
 250         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 251       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 252     else                                                        \
 253       c = *src, bytes = 1;                                      \
 254     if (!NILP (translation_table))                              \
 255       c = translate_char (translation_table, c, -1, 0, 0);      \
 256     src += bytes;                                               \
 257   } while (0)
 258
 259
 260 /* Produce a multibyte form of character C to `dst'.  Jump to
 261    `label_end_of_loop' if there's not enough space at `dst'.
 262
 263    If we are now in the middle of composition sequence, the decoded
 264    character may be ALTCHAR (for the current composition).  In that
 265    case, the character goes to coding->cmp_data->data instead of
 266    `dst'.
 267
 268    This macro is used in decoding routines.  */
 269
 270 #define EMIT_CHAR(c)                                                    \
 271   do {                                                                  \
 272     if (! COMPOSING_P (coding)                                          \
 273         || coding->composing == COMPOSITION_RELATIVE                    \
 274         || coding->composing == COMPOSITION_WITH_RULE)                  \
 275       {                                                                 \
 276         int bytes = CHAR_BYTES (c);                                     \
 277         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 278           {                                                             \
 279             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 280             goto label_end_of_loop;                                     \
 281           }                                                             \
 282         dst += CHAR_STRING (c, dst);                                    \
 283         coding->produced_char++;                                        \
 284       }                                                                 \
 285                                                                         \
 286     if (COMPOSING_P (coding)                                            \
 287         && coding->composing != COMPOSITION_RELATIVE)                   \
 288       {                                                                 \
 289         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 290         coding->composition_rule_follows                                \
 291           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 292       }                                                                 \
 293   } while (0)
 294
 295
 296 #define EMIT_ONE_BYTE(c)                                        \
 297   do {                                                          \
 298     if (dst >= (dst_bytes ? dst_end : src))                     \
 299       {                                                         \
 300         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 301         goto label_end_of_loop;                                 \
 302       }                                                         \
 303     *dst++ = c;                                                 \
 304   } while (0)
 305
 306 #define EMIT_TWO_BYTES(c1, c2)                                  \
 307   do {                                                          \
 308     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 309       {                                                         \
 310         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 311         goto label_end_of_loop;                                 \
 312       }                                                         \
 313     *dst++ = c1, *dst++ = c2;                                   \
 314   } while (0)
 315
 316 #define EMIT_BYTES(from, to)                                    \
 317   do {                                                          \
 318     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 319       {                                                         \
 320         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 321         goto label_end_of_loop;                                 \
 322       }                                                         \
 323     while (from < to)                                           \
 324       *dst++ = *from++;                                         \
 325   } while (0)
 326
 327 \f
 328 /*** 1. Preamble ***/
 329
 330 #ifdef emacs
 331 #include <config.h>
 332 #endif
 333
 334 #include <stdio.h>
 335
 336 #ifdef emacs
 337
 338 #include "lisp.h"
 339 #include "buffer.h"
 340 #include "charset.h"
 341 #include "composite.h"
 342 #include "ccl.h"
 343 #include "coding.h"
 344 #include "window.h"
 345
 346 #else  /* not emacs */
 347
 348 #include "mulelib.h"
 349
 350 #endif /* not emacs */
 351
 352 Lisp_Object Qcoding_system, Qeol_type;
 353 Lisp_Object Qbuffer_file_coding_system;
 354 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 355 Lisp_Object Qno_conversion, Qundecided;
 356 Lisp_Object Qcoding_system_history;
 357 Lisp_Object Qsafe_chars;
 358 Lisp_Object Qvalid_codes;
 359
 360 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 361 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 362 Lisp_Object Qstart_process, Qopen_network_stream;
 363 Lisp_Object Qtarget_idx;
 364
 365 Lisp_Object Vselect_safe_coding_system_function;
 366
 367 /* Mnemonic string for each format of end-of-line.  */
 368 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 369 /* Mnemonic string to indicate format of end-of-line is not yet
 370    decided.  */
 371 Lisp_Object eol_mnemonic_undecided;
 372
 373 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 374    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 375 int system_eol_type;
 376
 377 #ifdef emacs
 378
 379 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 380
 381 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 382
 383 /* Coding system emacs-mule and raw-text are for converting only
 384    end-of-line format.  */
 385 Lisp_Object Qemacs_mule, Qraw_text;
 386
 387 /* Coding-systems are handed between Emacs Lisp programs and C internal
 388    routines by the following three variables.  */
 389 /* Coding-system for reading files and receiving data from process.  */
 390 Lisp_Object Vcoding_system_for_read;
 391 /* Coding-system for writing files and sending data to process.  */
 392 Lisp_Object Vcoding_system_for_write;
 393 /* Coding-system actually used in the latest I/O.  */
 394 Lisp_Object Vlast_coding_system_used;
 395
 396 /* A vector of length 256 which contains information about special
 397    Latin codes (especially for dealing with Microsoft codes).  */
 398 Lisp_Object Vlatin_extra_code_table;
 399
 400 /* Flag to inhibit code conversion of end-of-line format.  */
 401 int inhibit_eol_conversion;
 402
 403 /* Flag to inhibit ISO2022 escape sequence detection.  */
 404 int inhibit_iso_escape_detection;
 405
 406 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 407 int inherit_process_coding_system;
 408
 409 /* Coding system to be used to encode text for terminal display.  */
 410 struct coding_system terminal_coding;
 411
 412 /* Coding system to be used to encode text for terminal display when
 413    terminal coding system is nil.  */
 414 struct coding_system safe_terminal_coding;
 415
 416 /* Coding system of what is sent from terminal keyboard.  */
 417 struct coding_system keyboard_coding;
 418
 419 /* Default coding system to be used to write a file.  */
 420 struct coding_system default_buffer_file_coding;
 421
 422 Lisp_Object Vfile_coding_system_alist;
 423 Lisp_Object Vprocess_coding_system_alist;
 424 Lisp_Object Vnetwork_coding_system_alist;
 425
 426 Lisp_Object Vlocale_coding_system;
 427
 428 #endif /* emacs */
 429
 430 Lisp_Object Qcoding_category, Qcoding_category_index;
 431
 432 /* List of symbols `coding-category-xxx' ordered by priority.  */
 433 Lisp_Object Vcoding_category_list;
 434
 435 /* Table of coding categories (Lisp symbols).  */
 436 Lisp_Object Vcoding_category_table;
 437
 438 /* Table of names of symbol for each coding-category.  */
 439 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 440   "coding-category-emacs-mule",
 441   "coding-category-sjis",
 442   "coding-category-iso-7",
 443   "coding-category-iso-7-tight",
 444   "coding-category-iso-8-1",
 445   "coding-category-iso-8-2",
 446   "coding-category-iso-7-else",
 447   "coding-category-iso-8-else",
 448   "coding-category-ccl",
 449   "coding-category-big5",
 450   "coding-category-utf-8",
 451   "coding-category-utf-16-be",
 452   "coding-category-utf-16-le",
 453   "coding-category-raw-text",
 454   "coding-category-binary"
 455 };
 456
 457 /* Table of pointers to coding systems corresponding to each coding
 458    categories.  */
 459 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 460
 461 /* Table of coding category masks.  Nth element is a mask for a coding
 462    category of which priority is Nth.  */
 463 static
 464 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 465
 466 /* Flag to tell if we look up translation table on character code
 467    conversion.  */
 468 Lisp_Object Venable_character_translation;
 469 /* Standard translation table to look up on decoding (reading).  */
 470 Lisp_Object Vstandard_translation_table_for_decode;
 471 /* Standard translation table to look up on encoding (writing).  */
 472 Lisp_Object Vstandard_translation_table_for_encode;
 473
 474 Lisp_Object Qtranslation_table;
 475 Lisp_Object Qtranslation_table_id;
 476 Lisp_Object Qtranslation_table_for_decode;
 477 Lisp_Object Qtranslation_table_for_encode;
 478
 479 /* Alist of charsets vs revision number.  */
 480 Lisp_Object Vcharset_revision_alist;
 481
 482 /* Default coding systems used for process I/O.  */
 483 Lisp_Object Vdefault_process_coding_system;
 484
 485 /* Global flag to tell that we can't call post-read-conversion and
 486    pre-write-conversion functions.  Usually the value is zero, but it
 487    is set to 1 temporarily while such functions are running.  This is
 488    to avoid infinite recursive call.  */
 489 static int inhibit_pre_post_conversion;
 490
 491 /* Char-table containing safe coding systems of each character.  */
 492 Lisp_Object Vchar_coding_system_table;
 493 Lisp_Object Qchar_coding_system;
 494
 495 /* Return `safe-chars' property of coding system CODING.  Don't check
 496    validity of CODING.  */
 497
 498 Lisp_Object
 499 coding_safe_chars (coding)
 500      struct coding_system *coding;
 501 {
 502   Lisp_Object coding_spec, plist, safe_chars;
 503
 504   coding_spec = Fget (coding->symbol, Qcoding_system);
 505   plist = XVECTOR (coding_spec)->contents[3];
 506   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 507   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 508 }
 509
 510 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 511   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 512
 513 \f
 514 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 515
 516 /* Emacs' internal format for encoding multiple character sets is a
 517    kind of multi-byte encoding, i.e. characters are encoded by
 518    variable-length sequences of one-byte codes.
 519
 520    ASCII characters and control characters (e.g. `tab', `newline') are
 521    represented by one-byte sequences which are their ASCII codes, in
 522    the range 0x00 through 0x7F.
 523
 524    8-bit characters of the range 0x80..0x9F are represented by
 525    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 526    code + 0x20).
 527
 528    8-bit characters of the range 0xA0..0xFF are represented by
 529    one-byte sequences which are their 8-bit code.
 530
 531    The other characters are represented by a sequence of `base
 532    leading-code', optional `extended leading-code', and one or two
 533    `position-code's.  The length of the sequence is determined by the
 534    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 535    whereas extended leading-code and position-code take the range 0xA0
 536    through 0xFF.  See `charset.h' for more details about leading-code
 537    and position-code.
 538
 539    --- CODE RANGE of Emacs' internal format ---
 540    character set        range
 541    -------------        -----
 542    ascii                0x00..0x7F
 543    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 544    eight-bit-graphic    0xA0..0xFF
 545    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 546    ---------------------------------------------
 547
 548   */
 549
 550 enum emacs_code_class_type emacs_code_class[256];
 551
 552 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 553    Check if a text is encoded in Emacs' internal format.  If it is,
 554    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 555
 556 static int
 557 detect_coding_emacs_mule (src, src_end, multibytep)
 558       unsigned char *src, *src_end;
 559       int multibytep;
 560 {
 561   unsigned char c;
 562   int composing = 0;
 563   /* Dummy for ONE_MORE_BYTE.  */
 564   struct coding_system dummy_coding;
 565   struct coding_system *coding = &dummy_coding;
 566
 567   while (1)
 568     {
 569       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 570
 571       if (composing)
 572         {
 573           if (c < 0xA0)
 574             composing = 0;
 575           else if (c == 0xA0)
 576             {
 577               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 578               c &= 0x7F;
 579             }
 580           else
 581             c -= 0x20;
 582         }
 583
 584       if (c < 0x20)
 585         {
 586           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 587             return 0;
 588         }
 589       else if (c >= 0x80 && c < 0xA0)
 590         {
 591           if (c == 0x80)
 592             /* Old leading code for a composite character.  */
 593             composing = 1;
 594           else
 595             {
 596               unsigned char *src_base = src - 1;
 597               int bytes;
 598
 599               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 600                                                bytes))
 601                 return 0;
 602               src = src_base + bytes;
 603             }
 604         }
 605     }
 606  label_end_of_loop:
 607   return CODING_CATEGORY_MASK_EMACS_MULE;
 608 }
 609
 610
 611 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 612
 613 static void
 614 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 615      struct coding_system *coding;
 616      unsigned char *source, *destination;
 617      int src_bytes, dst_bytes;
 618 {
 619   unsigned char *src = source;
 620   unsigned char *src_end = source + src_bytes;
 621   unsigned char *dst = destination;
 622   unsigned char *dst_end = destination + dst_bytes;
 623   /* SRC_BASE remembers the start position in source in each loop.
 624      The loop will be exited when there's not enough source code, or
 625      when there's not enough destination area to produce a
 626      character.  */
 627   unsigned char *src_base;
 628
 629   coding->produced_char = 0;
 630   while ((src_base = src) < src_end)
 631     {
 632       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 633       int bytes;
 634
 635       if (*src == '\r')
 636         {
 637           int c = *src++;
 638
 639           if (coding->eol_type == CODING_EOL_CR)
 640             c = '\n';
 641           else if (coding->eol_type == CODING_EOL_CRLF)
 642             {
 643               ONE_MORE_BYTE (c);
 644               if (c != '\n')
 645                 {
 646                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 647                     {
 648                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
 649                       goto label_end_of_loop;
 650                     }
 651                   src--;
 652                   c = '\r';
 653                 }
 654             }
 655           *dst++ = c;
 656           coding->produced_char++;
 657           continue;
 658         }
 659       else if (*src == '\n')
 660         {
 661           if ((coding->eol_type == CODING_EOL_CR
 662                || coding->eol_type == CODING_EOL_CRLF)
 663               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 664             {
 665               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 666               goto label_end_of_loop;
 667             }
 668           *dst++ = *src++;
 669           coding->produced_char++;
 670           continue;
 671         }
 672       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 673         {
 674           p = src;
 675           src += bytes;
 676         }
 677       else
 678         {
 679           bytes = CHAR_STRING (*src, tmp);
 680           p = tmp;
 681           src++;
 682         }
 683       if (dst + bytes >= (dst_bytes ? dst_end : src))
 684         {
 685           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 686           break;
 687         }
 688       while (bytes--) *dst++ = *p++;
 689       coding->produced_char++;
 690     }
 691  label_end_of_loop:
 692   coding->consumed = coding->consumed_char = src_base - source;
 693   coding->produced = dst - destination;
 694 }
 695
 696 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
 697   encode_eol (coding, source, destination, src_bytes, dst_bytes)
 698
 699
 700 \f
 701 /*** 3. ISO2022 handlers ***/
 702
 703 /* The following note describes the coding system ISO2022 briefly.
 704    Since the intention of this note is to help understand the
 705    functions in this file, some parts are NOT ACCURATE or OVERLY
 706    SIMPLIFIED.  For thorough understanding, please refer to the
 707    original document of ISO2022.
 708
 709    ISO2022 provides many mechanisms to encode several character sets
 710    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 711    is encoded using bytes less than 128.  This may make the encoded
 712    text a little bit longer, but the text passes more easily through
 713    several gateways, some of which strip off MSB (Most Significant Bit).
 714
 715    There are two kinds of character sets: control character set and
 716    graphic character set.  The former contains control characters such
 717    as `newline' and `escape' to provide control functions (control
 718    functions are also provided by escape sequences).  The latter
 719    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 720    two control character sets and many graphic character sets.
 721
 722    Graphic character sets are classified into one of the following
 723    four classes, according to the number of bytes (DIMENSION) and
 724    number of characters in one dimension (CHARS) of the set:
 725    - DIMENSION1_CHARS94
 726    - DIMENSION1_CHARS96
 727    - DIMENSION2_CHARS94
 728    - DIMENSION2_CHARS96
 729
 730    In addition, each character set is assigned an identification tag,
 731    unique for each set, called "final character" (denoted as <F>
 732    hereafter).  The <F> of each character set is decided by ECMA(*)
 733    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 734    (0x30..0x3F are for private use only).
 735
 736    Note (*): ECMA = European Computer Manufacturers Association
 737
 738    Here are examples of graphic character set [NAME(<F>)]:
 739         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 740         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 741         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 742         o DIMENSION2_CHARS96 -- none for the moment
 743
 744    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 745         C0 [0x00..0x1F] -- control character plane 0
 746         GL [0x20..0x7F] -- graphic character plane 0
 747         C1 [0x80..0x9F] -- control character plane 1
 748         GR [0xA0..0xFF] -- graphic character plane 1
 749
 750    A control character set is directly designated and invoked to C0 or
 751    C1 by an escape sequence.  The most common case is that:
 752    - ISO646's  control character set is designated/invoked to C0, and
 753    - ISO6429's control character set is designated/invoked to C1,
 754    and usually these designations/invocations are omitted in encoded
 755    text.  In a 7-bit environment, only C0 can be used, and a control
 756    character for C1 is encoded by an appropriate escape sequence to
 757    fit into the environment.  All control characters for C1 are
 758    defined to have corresponding escape sequences.
 759
 760    A graphic character set is at first designated to one of four
 761    graphic registers (G0 through G3), then these graphic registers are
 762    invoked to GL or GR.  These designations and invocations can be
 763    done independently.  The most common case is that G0 is invoked to
 764    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 765    these invocations and designations are omitted in encoded text.
 766    In a 7-bit environment, only GL can be used.
 767
 768    When a graphic character set of CHARS94 is invoked to GL, codes
 769    0x20 and 0x7F of the GL area work as control characters SPACE and
 770    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 771    be used.
 772
 773    There are two ways of invocation: locking-shift and single-shift.
 774    With locking-shift, the invocation lasts until the next different
 775    invocation, whereas with single-shift, the invocation affects the
 776    following character only and doesn't affect the locking-shift
 777    state.  Invocations are done by the following control characters or
 778    escape sequences:
 779
 780    ----------------------------------------------------------------------
 781    abbrev  function                  cntrl escape seq   description
 782    ----------------------------------------------------------------------
 783    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 784    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 785    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 786    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 787    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 788    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 789    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 790    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 791    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 792    ----------------------------------------------------------------------
 793    (*) These are not used by any known coding system.
 794
 795    Control characters for these functions are defined by macros
 796    ISO_CODE_XXX in `coding.h'.
 797
 798    Designations are done by the following escape sequences:
 799    ----------------------------------------------------------------------
 800    escape sequence      description
 801    ----------------------------------------------------------------------
 802    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 803    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 804    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 805    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 806    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 807    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 808    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 809    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 810    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 811    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 812    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 813    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 814    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 815    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 816    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 817    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 818    ----------------------------------------------------------------------
 819
 820    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 821    of dimension 1, chars 94, and final character <F>, etc...
 822
 823    Note (*): Although these designations are not allowed in ISO2022,
 824    Emacs accepts them on decoding, and produces them on encoding
 825    CHARS96 character sets in a coding system which is characterized as
 826    7-bit environment, non-locking-shift, and non-single-shift.
 827
 828    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 829    '(' can be omitted.  We refer to this as "short-form" hereafter.
 830
 831    Now you may notice that there are a lot of ways for encoding the
 832    same multilingual text in ISO2022.  Actually, there exist many
 833    coding systems such as Compound Text (used in X11's inter-client
 834    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 835    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 836    localized platforms), and all of these are variants of ISO2022.
 837
 838    In addition to the above, Emacs handles two more kinds of escape
 839    sequences: ISO6429's direction specification and Emacs' private
 840    sequence for specifying character composition.
 841
 842    ISO6429's direction specification takes the following form:
 843         o CSI ']'      -- end of the current direction
 844         o CSI '0' ']'  -- end of the current direction
 845         o CSI '1' ']'  -- start of left-to-right text
 846         o CSI '2' ']'  -- start of right-to-left text
 847    The control character CSI (0x9B: control sequence introducer) is
 848    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 849
 850    Character composition specification takes the following form:
 851         o ESC '0' -- start relative composition
 852         o ESC '1' -- end composition
 853         o ESC '2' -- start rule-base composition (*)
 854         o ESC '3' -- start relative composition with alternate chars  (**)
 855         o ESC '4' -- start rule-base composition with alternate chars  (**)
 856   Since these are not standard escape sequences of any ISO standard,
 857   the use of them with these meanings is restricted to Emacs only.
 858
 859   (*) This form is used only in Emacs 20.5 and the older versions,
 860   but the newer versions can safely decode it.
 861   (**) This form is used only in Emacs 21.1 and the newer versions,
 862   and the older versions can't decode it.
 863
 864   Here's a list of example usages of these composition escape
 865   sequences (categorized by `enum composition_method').
 866
 867   COMPOSITION_RELATIVE:
 868         ESC 0 CHAR [ CHAR ] ESC 1
 869   COMPOSITION_WITH_RULE:
 870         ESC 2 CHAR [ RULE CHAR ] ESC 1
 871   COMPOSITION_WITH_ALTCHARS:
 872         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 873   COMPOSITION_WITH_RULE_ALTCHARS:
 874         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 875
 876 enum iso_code_class_type iso_code_class[256];  /* Code class of each byte value.  */
 877
     /* Evaluate to non-zero when all of the following hold:
        - coding_system_table[IDX] is non-null,
        - CHARSET is CHARSET_ASCII, or the character C passes
          CODING_SAFE_CHAR_P for the table returned by coding_safe_chars
          for that coding system,
        - the requested designation of CHARSET in that coding system is
          not CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.
        When CHARSET is not ASCII this assigns to a variable named
        `safe_chars', which must be declared where the macro is expanded.
        IDX and CHARSET appear more than once in the expansion, so they
        should be free of side effects.  */
 878 #define CHARSET_OK(idx, charset, c)                                     \
 879   (coding_system_table[idx]                                             \
 880    && (charset == CHARSET_ASCII                                         \
 881        || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
 882            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
 883    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
 884                                               charset)                  \
 885        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
 886
     /* Evaluate to non-zero when the initial designation recorded at
        index 1 (presumably graphic register G1 -- TODO confirm) of
        coding_system_table[IDX] is not negative.
        NOTE(review): unlike CHARSET_OK, this does not test
        coding_system_table[IDX] for null before using it.  */
 887 #define SHIFT_OUT_OK(idx) \
 888   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 889
 890 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 891    Check if a text is encoded in ISO2022.  If it is, return an
 892    integer in which the appropriate flag bits among:
 893         CODING_CATEGORY_MASK_ISO_7
 894         CODING_CATEGORY_MASK_ISO_7_TIGHT
 895         CODING_CATEGORY_MASK_ISO_8_1
 896         CODING_CATEGORY_MASK_ISO_8_2
 897         CODING_CATEGORY_MASK_ISO_7_ELSE
 898         CODING_CATEGORY_MASK_ISO_8_ELSE
 899    are set.  If a code which should never appear in ISO2022 is found,
 900    return 0.

        The text examined is the bytes from SRC up to, but not including,
        SRC_END.  MULTIBYTEP is only handed on to
        ONE_MORE_BYTE_CHECK_MULTIBYTE, which fetches each byte.

        Two bit sets are maintained while scanning: MASK holds the
        categories that have not been ruled out yet, and MASK_FOUND the
        categories for which some positive evidence was seen.  The value
        returned at the end is their intersection.  Scanning stops early
        once MASK becomes zero.

        While `inhibit_iso_escape_detection' is non-zero, the codes ESC,
        SO, SI, CSI, SS2 and SS3 change neither MASK nor MASK_FOUND.  */
 901
 902 static int
 903 detect_coding_iso2022 (src, src_end, multibytep)
 904      unsigned char *src, *src_end;
 905      int multibytep;
 906 {
 907   int mask = CODING_CATEGORY_MASK_ISO;
 908   int mask_found = 0;
       /* NOTE(review): REG is written by the designation cases, but only
          reg[1] is ever read (in the SO case).  SHIFT_OUT is never
          assigned after this initialization, so the `shift_out == 1'
          test in the SI case cannot succeed as the code stands.  */
 909   int reg[4], shift_out = 0, single_shifting = 0;
       /* NOTE(review): this I is not used; the odd-length check near the
          end declares its own I in an inner block.  */
 910   int c, c1, i, charset;
 911   /* Dummy for ONE_MORE_BYTE.  */
 912   struct coding_system dummy_coding;
 913   struct coding_system *coding = &dummy_coding;
       /* Scratch variable assigned by CHARSET_OK.  */
 914   Lisp_Object safe_chars;
 915
       /* Start with ASCII in reg[0]; -1 in the other slots means that no
          designation has been seen for them.  */
 916   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 917   while (mask && src < src_end)
 918     {
 919       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 920       switch (c)
 921         {
 922         case ISO_CODE_ESC:
 923           if (inhibit_iso_escape_detection)
 924             break;
 925           single_shifting = 0;
 926           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
               /* Every `break' in the if/else chain below leaves the
                  switch.  Only a branch that stored a charset in REG
                  falls out of the chain to the designation check that
                  follows it.  */
 927           if (c >= '(' && c <= '/')
 928             {
 929               /* Designation sequence for a charset of dimension 1.  */
 930               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
                   /* `c >= ','' picks the second half of the table for the
                      intermediate bytes ',' .. '/' (the CHARS96
                      designations in the list of escape sequences above).  */
 931               if (c1 < ' ' || c1 >= 0x80
 932                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 933                 /* Invalid designation sequence.  Just ignore.  */
 934                 break;
 935               reg[(c - '(') % 4] = charset;
 936             }
 937           else if (c == '$')
 938             {
 939               /* Designation sequence for a charset of dimension 2.  */
 940               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 941               if (c >= '@' && c <= 'B')
 942                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 943                 reg[0] = charset = iso_charset_table[1][0][c];
 944               else if (c >= '(' && c <= '/')
 945                 {
 946                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
 947                   if (c1 < ' ' || c1 >= 0x80
 948                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 949                     /* Invalid designation sequence.  Just ignore.  */
 950                     break;
 951                   reg[(c - '(') % 4] = charset;
 952                 }
 953               else
 954                 /* Invalid designation sequence.  Just ignore.  */
 955                 break;
 956             }
 957           else if (c == 'N' || c == 'O')
 958             {
 959               /* ESC <Fe> for SS2 or SS3.  */
                   /* Only the ISO_7_ELSE category survives in MASK;
                      MASK_FOUND is not touched here.  */
 960               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 961               break;
 962             }
 963           else if (c >= '0' && c <= '4')
 964             {
 965               /* ESC <Fp> for start/end composition.  */
 966               mask_found |= CODING_CATEGORY_MASK_ISO;
 967               break;
 968             }
 969           else
 970             /* Invalid escape sequence.  Just ignore.  */
 971             break;
 972
 973           /* We found a valid designation sequence for CHARSET.  */
               /* The categories in CODING_CATEGORY_MASK_ISO_8BIT are ruled
                  out.  Each of the four categories tested below is marked
                  as found if CHARSET_OK accepts CHARSET for it, and ruled
                  out otherwise.  C is reused to hold a character made from
                  CHARSET for the safe-character test.  */
 974           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 975           c = MAKE_CHAR (charset, 0, 0);
 976           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
 977             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 978           else
 979             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 980           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
 981             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 982           else
 983             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 984           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
 985             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 986           else
 987             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 988           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
 989             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 990           else
 991             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 992           break;
 993
 994         case ISO_CODE_SO:
 995           if (inhibit_iso_escape_detection)
 996             break;
 997           single_shifting = 0;
               /* SO counts as a locking shift only if something has been
                  designated to reg[1], or SHIFT_OUT_OK holds for the
                  ISO_7_ELSE or ISO_8_ELSE category.  */
 998           if (shift_out == 0
 999               && (reg[1] >= 0
1000                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1001                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1002             {
1003               /* Locking shift out.  */
1004               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1005               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1006             }
1007           break;
1008
1009         case ISO_CODE_SI:
1010           if (inhibit_iso_escape_detection)
1011             break;
1012           single_shifting = 0;
               /* NOTE(review): nothing in this function sets SHIFT_OUT to
                  1, so this block is not reached as written.  */
1013           if (shift_out == 1)
1014             {
1015               /* Locking shift in.  */
1016               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1017               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1018             }
1019           break;
1020
1021         case ISO_CODE_CSI:
1022           single_shifting = 0;
               /* Fall through: CSI shares the handling below, except for
                  the single-shift part guarded by `c != ISO_CODE_CSI'.  */
1023         case ISO_CODE_SS2:
1024         case ISO_CODE_SS3:
1025           {
                 /* NEWMASK collects the categories that remain possible
                    after this code; ISO_8_ELSE always does.  */
1026             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1027
1028             if (inhibit_iso_escape_detection)
1029               break;
                 /* NOTE(review): the coding_system_table entries for
                    ISO_8_1 and ISO_8_2 are dereferenced below without the
                    null test that CHARSET_OK performs -- confirm that they
                    are always set when this function runs.  */
1030             if (c != ISO_CODE_CSI)
1031               {
1032                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1033                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1034                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1035                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1036                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1037                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1038                 single_shifting = 1;
1039               }
                 /* The code may also be an extra Latin code if
                    Vlatin_extra_code_table has a non-nil entry for it.  */
1040             if (VECTORP (Vlatin_extra_code_table)
1041                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1042               {
1043                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1044                     & CODING_FLAG_ISO_LATIN_EXTRA)
1045                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1046                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1047                     & CODING_FLAG_ISO_LATIN_EXTRA)
1048                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1049               }
1050             mask &= newmask;
1051             mask_found |= newmask;
1052           }
1053           break;
1054
1055         default:
1056           if (c < 0x80)
1057             {
                   /* Any other 7-bit code just ends a single shift.  */
1058               single_shifting = 0;
1059               break;
1060             }
1061           else if (c < 0xA0)
1062             {
                   /* A code in 0x80..0x9F not handled by a case above.  It
                      is acceptable only as an extra Latin code; otherwise
                      the text is not ISO2022 at all.  */
1063               single_shifting = 0;
1064               if (VECTORP (Vlatin_extra_code_table)
1065                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1066                 {
1067                   int newmask = 0;
1068
1069                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1070                       & CODING_FLAG_ISO_LATIN_EXTRA)
1071                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1072                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1073                       & CODING_FLAG_ISO_LATIN_EXTRA)
1074                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1075                   mask &= newmask;
1076                   mask_found |= newmask;
1077                 }
1078               else
1079                 return 0;
1080             }
1081           else
1082             {
                   /* A code in 0xA0..0xFF.  */
1083               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1084                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1085               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1086               /* Check the length of succeeding codes of the range
1087                  0xA0..0xFF.  If the byte length is odd, we exclude
1088                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1089                  when we are not single shifting.  */
1090               if (!single_shifting
1091                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1092                 {
                       /* I counts the codes of the run, including the one
                          already read.
                          NOTE(review): the code that ends the run has been
                          fetched by the macro below and is not dispatched
                          through the switch afterwards (assuming the macro
                          advances SRC) -- confirm this is intended.  */
1093                   int i = 1;
1094                   while (src < src_end)
1095                     {
1096                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1097                       if (c < 0xA0)
1098                         break;
1099                       i++;
1100                     }
1101
                       /* An odd length rules ISO_8_2 out only while input
                          remains after the run; in every other case ISO_8_2
                          is recorded as found.  */
1102                   if (i & 1 && src < src_end)
1103                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1104                   else
1105                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1106                 }
1107             }
1108           break;
1109         }
1110     }
       /* No `goto' to this label is visible in the text of this function;
          presumably ONE_MORE_BYTE_CHECK_MULTIBYTE jumps here when the
          input runs out -- TODO confirm against the macro definition.  */
1111  label_end_of_loop:
1112   return (mask & mask_found);
1113 }
1114
1115 /* Build the character whose charset is CHARSET and whose position
1116    codes are C1 (1st) and C2 (2nd).  While the variable
1117    `translation_table' visible at the expansion site is nil, the
1118    result is MAKE_CHAR of the three values; otherwise it is what
        translate_char yields for them with that table.  */
1119
1120 #define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
1121   (!NILP (translation_table)                                    \
1122    ? translate_char (translation_table, -1, charset, c1, c2)    \
1123    : MAKE_CHAR (charset, c1, c2))
1124
1125 /* Set designation state into CODING: record that the charset named
        by DIMENSION, CHARS and FINAL_CHAR is designated to register REG.

        The expansion uses the variables `coding' and `safe_chars' and
        the label `label_invalid_code' of the place where it is expanded.
        REG and FINAL_CHAR are evaluated more than once.

        Control goes to label_invalid_code when:
        - FINAL_CHAR is below '0' or is 128 or above;
        - ISO_CHARSET_TABLE has no charset (a negative value) for the
          triple, or the charset is neither requested for REG by CODING
          nor safe according to `safe_chars'.  In this case REG is
          remembered in last_invalid_designation_register;
        - the charset is ASCII, REG is 0 and the last invalid
          designation was to register 0 (see the comment in the body).

        Otherwise last_invalid_designation_register is reset to -1 and
        the charset is stored as the designation of REG.  If CODING's
        mode has CODING_MODE_DIRECTION set and the charset has a reverse
        charset, the reverse charset is stored instead.

        NOTE(review): MAKE_CHAR (charset, 0, 0) is evaluated before
        `charset >= 0' is tested -- confirm MAKE_CHAR tolerates a
        negative charset.  */
1126 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1127   do {                                                                     \
1128     int charset, c;                                                        \
1129                                                                            \
1130     if (final_char < '0' || final_char >= 128)                             \
1131       goto label_invalid_code;                                             \
1132     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1133                                  make_number (chars),                      \
1134                                  make_number (final_char));                \
1135     c = MAKE_CHAR (charset, 0, 0);                                         \
1136     if (charset >= 0                                                       \
1137         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1138             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1139       {                                                                    \
1140         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1141             && reg == 0                                                    \
1142             && charset == CHARSET_ASCII)                                   \
1143           {                                                                \
1144             /* We should insert this designation sequence as is so         \
1145                that it is surely written back to a file.  */               \
1146             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1147             goto label_invalid_code;                                       \
1148           }                                                                \
1149         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1150         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1151             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1152           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1153         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1154       }                                                                    \
1155     else                                                                   \
1156       {                                                                    \
1157         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1158         goto label_invalid_code;                                           \
1159       }                                                                    \
1160   } while (0)
1161
1162 /* Allocate one more `struct composition_data' block for CODING and
1163    make it the current one (coding->cmp_data).  The new block starts
        empty (`used' is 0), remembers CHAR_OFFSET, and is linked after
        the block that was current before the call, if there was one.
        coding->cmp_data_start is reset to 0.  */
1164
1165 void
1166 coding_allocate_composition_data (coding, char_offset)
1167      struct coding_system *coding;
1168      int char_offset;
1169 {
1170   struct composition_data *block
1171     = (struct composition_data *) xmalloc (sizeof *block);
1172
1173   block->used = 0;
1174   block->char_offset = char_offset;
1175   block->next = NULL;
1176   block->prev = coding->cmp_data;
1177   if (block->prev != NULL)
1178     block->prev->next = block;
1179   coding->cmp_data_start = 0;
1180   coding->cmp_data = block;
1181 }
1182
1183 /* Record the starting position START and METHOD of one composition.

        Four slots are taken at the current end of
        coding->cmp_data->data:
          data[0]  -1 for now; CODING_ADD_COMPOSITION_END later stores
                   here the number of slots used by this composition
          data[1]  the block's char_offset plus START
          data[2]  not written here; CODING_ADD_COMPOSITION_END sets it
          data[3]  METHOD converted to int
        The index of data[0] is saved in coding->cmp_data_start.

        coding->cmp_data must be non-null, and no check for free room is
        made here.  The expansion declares locals named `cmp_data' and
        `data', and uses its arguments without extra parentheses.  */
1184
1185 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
1186   do {                                                          \
1187     struct composition_data *cmp_data = coding->cmp_data;       \
1188     int *data = cmp_data->data + cmp_data->used;                \
1189     coding->cmp_data_start = cmp_data->used;                    \
1190     data[0] = -1;                                               \
1191     data[1] = cmp_data->char_offset + start;                    \
1192     data[3] = (int) method;                                     \
1193     cmp_data->used += 4;                                        \
1194   } while (0)
1195
1196 /* Record the ending position END of the current composition.

        The slots that begin at coding->cmp_data_start in
        coding->cmp_data->data are completed:
          data[0]  the number of slots from cmp_data_start up to the
                   current `used' count of the block
          data[2]  the block's char_offset plus END
        coding->cmp_data must be non-null.  The expansion declares locals
        named `cmp_data' and `data', and uses its arguments without extra
        parentheses.  */
1197
1198 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
1199   do {                                                          \
1200     struct composition_data *cmp_data = coding->cmp_data;       \
1201     int *data = cmp_data->data + coding->cmp_data_start;        \
1202     data[0] = cmp_data->used - coding->cmp_data_start;          \
1203     data[2] = cmp_data->char_offset + end;                      \
1204   } while (0)
1205
1206 /* Record one COMPONENT (alternate character or composition rule).
        COMPONENT is stored in the next free slot of
        coding->cmp_data->data and the block's `used' count is advanced
        by one.  The value of the expression is the stored component.
        No check for free room is made here.  */
1207
1208 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
1209   (coding->cmp_data->data[coding->cmp_data->used++] = component)
1210
1211 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
        C1 is the character that followed ESC.

        The expansion uses the variables `coding' and `dst' and the label
        `label_end_of_loop' of the place where it is expanded.

        - If composition is disabled (coding->composing is
          COMPOSITION_DISABLED), ESC and the low seven bits of C1 are
          copied to DST and coding->produced_char grows by 2.
        - If no composition is in progress, a new one is started: the
          method is chosen from C1 ('0' relative, '2' with rule, '3' with
          alternate chars, anything else with rule and alternate chars)
          and its start is recorded with CODING_ADD_COMPOSITION_START.
          When coding->cmp_data is missing or may be too full, the
          result is set to CODING_FINISH_INSUFFICIENT_CMP and control
          goes to label_end_of_loop instead.
        - If a composition is already in progress and its method is one
          of the two with alternate chars, the method is switched to
          COMPOSITION_RELATIVE; otherwise nothing is done.  */
1212
1213 #define DECODE_COMPOSITION_START(c1)                                       \
1214   do {                                                                     \
1215     if (coding->composing == COMPOSITION_DISABLED)                         \
1216       {                                                                    \
1217         *dst++ = ISO_CODE_ESC;                                             \
1218         *dst++ = c1 & 0x7f;                                                \
1219         coding->produced_char += 2;                                        \
1220       }                                                                    \
1221     else if (!COMPOSING_P (coding))                                        \
1222       {                                                                    \
1223         /* This is surely the start of a composition.  We must be sure     \
1224            that coding->cmp_data has enough space to store the             \
1225            information about the composition.  If not, terminate the       \
1226            current decoding loop, allocate one more memory block for       \
1227            coding->cmp_data in the caller, then start the decoding         \
1228            loop again.  We can't allocate memory here directly because     \
1229            it may cause buffer/string relocation.  */                      \
1230         if (!coding->cmp_data                                              \
1231             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1232                 >= COMPOSITION_DATA_SIZE))                                 \
1233           {                                                                \
1234             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1235             goto label_end_of_loop;                                        \
1236           }                                                                \
1237         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1238                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1239                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1240                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1241         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1242                                       coding->composing);                  \
1243         coding->composition_rule_follows = 0;                              \
1244       }                                                                    \
1245     else                                                                   \
1246       {                                                                    \
1247         /* We are already handling a composition.  If the method is        \
1248            the following two, the codes following the current escape       \
1249            sequence are actual characters stored in a buffer.  */          \
1250         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1251             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1252           {                                                                \
1253             coding->composing = COMPOSITION_RELATIVE;                      \
1254             coding->composition_rule_follows = 0;                          \
1255           }                                                                \
1256       }                                                                    \
1257   } while (0)
1258
1259 /* Handle composition end sequence ESC 1.  C1 is the character that
        followed ESC.

        The expansion uses the variables `coding' and `dst' of the place
        where it is expanded.

        If composition is disabled, or no composition is in progress, the
        sequence is passed through: ESC and C1 are copied to DST and
        coding->produced_char grows by 2.  Recording an end in the
        latter case would dereference coding->cmp_data, which may still
        be null before any composition has started, and would overwrite
        the slots of an earlier composition.

        Otherwise the end position of the current composition is
        recorded with CODING_ADD_COMPOSITION_END and coding->composing
        becomes COMPOSITION_NO.  */
1260
1261 #define DECODE_COMPOSITION_END(c1)                                      \
1262   do {                                                                  \
1263     if (coding->composing == COMPOSITION_DISABLED                       \
             || !COMPOSING_P (coding))                                       \
1264       {                                                                 \
1265         *dst++ = ISO_CODE_ESC;                                          \
1266         *dst++ = (c1);                                                  \
1267         coding->produced_char += 2;                                     \
1268       }                                                                 \
1269     else                                                                \
1270       {                                                                 \
1271         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1272         coding->composing = COMPOSITION_NO;                             \
1273       }                                                                 \
1274   } while (0)
1275
1276 /* Decode a composition rule from the byte C1 (and maybe one more byte
1277    from SRC) and store one encoded composition rule in
1278    coding->cmp_data.

        C1 must be a modifiable variable: 32 is subtracted from it in
        place.  With the reduced value:
        - below 81, the rule is in the old one-byte format; the two
          references are C1 / 9 and C1 % 9, where a value of 4 is
          replaced by 10;
        - from 81 up to 92, the rule is in the new two-byte format; one
          more byte is fetched into `c2' with ONE_MORE_BYTE and the rule
          is built from C1 - 81 and c2 - 32;
        - otherwise the rule stays 0.
          NOTE(review): confirm that storing rule 0 is the intended
          handling of such a byte.
        The rule is appended with CODING_ADD_COMPOSITION_COMPONENT and
        coding->composition_rule_follows is cleared.

        The expansion uses the variables `coding' and `c2' of the place
        where it is expanded.  */
1279
1280 #define DECODE_COMPOSITION_RULE(c1)                                     \
1281   do {                                                                  \
1282     int rule = 0;                                                       \
1283     (c1) -= 32;                                                         \
1284     if (c1 < 81)                /* old format (before ver.21) */        \
1285       {                                                                 \
1286         int gref = (c1) / 9;                                            \
1287         int nref = (c1) % 9;                                            \
1288         if (gref == 4) gref = 10;                                       \
1289         if (nref == 4) nref = 10;                                       \
1290         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1291       }                                                                 \
1292     else if (c1 < 93)           /* new format (after ver.21) */         \
1293       {                                                                 \
1294         ONE_MORE_BYTE (c2);                                             \
1295         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1296       }                                                                 \
1297     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1298     coding->composition_rule_follows = 0;                               \
1299   } while (0)
1300
1301
1302 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1303
1304 static void
1305 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1306      struct coding_system *coding;
1307      unsigned char *source, *destination;
1308      int src_bytes, dst_bytes;
1309 {
1310   unsigned char *src = source;
1311   unsigned char *src_end = source + src_bytes;
1312   unsigned char *dst = destination;
1313   unsigned char *dst_end = destination + dst_bytes;
1314   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1315   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1316   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1317   /* SRC_BASE remembers the start position in source in each loop.
1318      The loop will be exited when there's not enough source code
1319      (within macro ONE_MORE_BYTE), or when there's not enough
1320      destination area to produce a character (within macro
1321      EMIT_CHAR).  */
1322   unsigned char *src_base;
1323   int c, charset;
1324   Lisp_Object translation_table;
1325   Lisp_Object safe_chars;
1326
1327   safe_chars = coding_safe_chars (coding);
1328
1329   if (NILP (Venable_character_translation))
1330     translation_table = Qnil;
1331   else
1332     {
1333       translation_table = coding->translation_table_for_decode;
1334       if (NILP (translation_table))
1335         translation_table = Vstandard_translation_table_for_decode;
1336     }
1337
1338   coding->result = CODING_FINISH_NORMAL;
1339
1340   while (1)
1341     {
1342       int c1, c2;
1343
1344       src_base = src;
1345       ONE_MORE_BYTE (c1);
1346
1347       /* We produce no character or one character.  */
1348       switch (iso_code_class [c1])
1349         {
1350         case ISO_0x20_or_0x7F:
1351           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1352             {
1353               DECODE_COMPOSITION_RULE (c1);
1354               continue;
1355             }
1356           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1357             {
1358               /* This is SPACE or DEL.  */
1359               charset = CHARSET_ASCII;
1360               break;
1361             }
1362           /* This is a graphic character, we fall down ...  */
1363
1364         case ISO_graphic_plane_0:
1365           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1366             {
1367               DECODE_COMPOSITION_RULE (c1);
1368               continue;
1369             }
1370           charset = charset0;
1371           break;
1372
1373         case ISO_0xA0_or_0xFF:
1374           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1375               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1376             goto label_invalid_code;
1377           /* This is a graphic character, we fall down ... */
1378
1379         case ISO_graphic_plane_1:
1380           if (charset1 < 0)
1381             goto label_invalid_code;
1382           charset = charset1;
1383           break;
1384
1385         case ISO_control_0:
1386           if (COMPOSING_P (coding))
1387             DECODE_COMPOSITION_END ('1');
1388
1389           /* All ISO2022 control characters in this class have the
1390              same representation in Emacs internal format.  */
1391           if (c1 == '\n'
1392               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1393               && (coding->eol_type == CODING_EOL_CR
1394                   || coding->eol_type == CODING_EOL_CRLF))
1395             {
1396               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1397               goto label_end_of_loop;
1398             }
1399           charset = CHARSET_ASCII;
1400           break;
1401
1402         case ISO_control_1:
1403           if (COMPOSING_P (coding))
1404             DECODE_COMPOSITION_END ('1');
1405           goto label_invalid_code;
1406
1407         case ISO_carriage_return:
1408           if (COMPOSING_P (coding))
1409             DECODE_COMPOSITION_END ('1');
1410
1411           if (coding->eol_type == CODING_EOL_CR)
1412             c1 = '\n';
1413           else if (coding->eol_type == CODING_EOL_CRLF)
1414             {
1415               ONE_MORE_BYTE (c1);
1416               if (c1 != ISO_CODE_LF)
1417                 {
1418                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1419                     {
1420                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1421                       goto label_end_of_loop;
1422                     }
1423                   src--;
1424                   c1 = '\r';
1425                 }
1426             }
1427           charset = CHARSET_ASCII;
1428           break;
1429
1430         case ISO_shift_out:
1431           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1432               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1433             goto label_invalid_code;
1434           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1435           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1436           continue;
1437
1438         case ISO_shift_in:
1439           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1440             goto label_invalid_code;
1441           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1442           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1443           continue;
1444
1445         case ISO_single_shift_2_7:
1446         case ISO_single_shift_2:
1447           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1448             goto label_invalid_code;
1449           /* SS2 is handled as an escape sequence of ESC 'N' */
1450           c1 = 'N';
1451           goto label_escape_sequence;
1452
1453         case ISO_single_shift_3:
1454           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1455             goto label_invalid_code;
1456           /* SS3 is handled as an escape sequence of ESC 'O' */
1457           c1 = 'O';
1458           goto label_escape_sequence;
1459
1460         case ISO_control_sequence_introducer:
1461           /* CSI is handled as an escape sequence of ESC '[' ...  */
1462           c1 = '[';
1463           goto label_escape_sequence;
1464
1465         case ISO_escape:
1466           ONE_MORE_BYTE (c1);
1467         label_escape_sequence:
1468           /* Escape sequences handled by Emacs are invocation,
1469              designation, direction specification, and character
1470              composition specification.  */
1471           switch (c1)
1472             {
1473             case '&':           /* revision of following character set */
1474               ONE_MORE_BYTE (c1);
1475               if (!(c1 >= '@' && c1 <= '~'))
1476                 goto label_invalid_code;
1477               ONE_MORE_BYTE (c1);
1478               if (c1 != ISO_CODE_ESC)
1479                 goto label_invalid_code;
1480               ONE_MORE_BYTE (c1);
1481               goto label_escape_sequence;
1482
1483             case '$':           /* designation of 2-byte character set */
1484               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1485                 goto label_invalid_code;
1486               ONE_MORE_BYTE (c1);
1487               if (c1 >= '@' && c1 <= 'B')
1488                 {       /* designation of JISX0208.1978, GB2312.1980,
1489                            or JISX0208.1980 */
1490                   DECODE_DESIGNATION (0, 2, 94, c1);
1491                 }
1492               else if (c1 >= 0x28 && c1 <= 0x2B)
1493                 {       /* designation of DIMENSION2_CHARS94 character set */
1494                   ONE_MORE_BYTE (c2);
1495                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1496                 }
1497               else if (c1 >= 0x2C && c1 <= 0x2F)
1498                 {       /* designation of DIMENSION2_CHARS96 character set */
1499                   ONE_MORE_BYTE (c2);
1500                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1501                 }
1502               else
1503                 goto label_invalid_code;
1504               /* We must update these variables now.  */
1505               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1506               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1507               continue;
1508
1509             case 'n':           /* invocation of locking-shift-2 */
1510               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1511                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1512                 goto label_invalid_code;
1513               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1514               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1515               continue;
1516
1517             case 'o':           /* invocation of locking-shift-3 */
1518               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1519                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1520                 goto label_invalid_code;
1521               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1522               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1523               continue;
1524
1525             case 'N':           /* invocation of single-shift-2 */
1526               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1527                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1528                 goto label_invalid_code;
1529               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1530               ONE_MORE_BYTE (c1);
1531               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1532                 goto label_invalid_code;
1533               break;
1534
1535             case 'O':           /* invocation of single-shift-3 */
1536               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1537                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1538                 goto label_invalid_code;
1539               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1540               ONE_MORE_BYTE (c1);
1541               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1542                 goto label_invalid_code;
1543               break;
1544
1545             case '0': case '2': case '3': case '4': /* start composition */
1546               DECODE_COMPOSITION_START (c1);
1547               continue;
1548
1549             case '1':           /* end composition */
1550               DECODE_COMPOSITION_END (c1);
1551               continue;
1552
1553             case '[':           /* specification of direction */
1554               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1555                 goto label_invalid_code;
1556               /* For the moment, nested direction is not supported.
1557                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1558                  left-to-right, and nonzero means right-to-left.  */
1559               ONE_MORE_BYTE (c1);
1560               switch (c1)
1561                 {
1562                 case ']':       /* end of the current direction */
1563                   coding->mode &= ~CODING_MODE_DIRECTION;
1564
1565                 case '0':       /* end of the current direction */
1566                 case '1':       /* start of left-to-right direction */
1567                   ONE_MORE_BYTE (c1);
1568                   if (c1 == ']')
1569                     coding->mode &= ~CODING_MODE_DIRECTION;
1570                   else
1571                     goto label_invalid_code;
1572                   break;
1573
1574                 case '2':       /* start of right-to-left direction */
1575                   ONE_MORE_BYTE (c1);
1576                   if (c1 == ']')
1577                     coding->mode |= CODING_MODE_DIRECTION;
1578                   else
1579                     goto label_invalid_code;
1580                   break;
1581
1582                 default:
1583                   goto label_invalid_code;
1584                 }
1585               continue;
1586
1587             default:
1588               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1589                 goto label_invalid_code;
1590               if (c1 >= 0x28 && c1 <= 0x2B)
1591                 {       /* designation of DIMENSION1_CHARS94 character set */
1592                   ONE_MORE_BYTE (c2);
1593                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1594                 }
1595               else if (c1 >= 0x2C && c1 <= 0x2F)
1596                 {       /* designation of DIMENSION1_CHARS96 character set */
1597                   ONE_MORE_BYTE (c2);
1598                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1599                 }
1600               else
1601                 goto label_invalid_code;
1602               /* We must update these variables now.  */
1603               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1604               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1605               continue;
1606             }
1607         }
1608
1609       /* Now we know CHARSET and 1st position code C1 of a character.
1610          Produce a multibyte sequence for that character while getting
1611          2nd position code C2 if necessary.  */
1612       if (CHARSET_DIMENSION (charset) == 2)
1613         {
1614           ONE_MORE_BYTE (c2);
1615           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1616             /* C2 is not in a valid range.  */
1617             goto label_invalid_code;
1618         }
1619       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1620       EMIT_CHAR (c);
1621       continue;
1622
1623     label_invalid_code:
1624       coding->errors++;
1625       if (COMPOSING_P (coding))
1626         DECODE_COMPOSITION_END ('1');
1627       src = src_base;
1628       c = *src++;
1629       EMIT_CHAR (c);
1630     }
1631
1632  label_end_of_loop:
1633   coding->consumed = coding->consumed_char = src_base - source;
1634   coding->produced = dst - destination;
1635   return;
1636 }
1637
1638
1639 /* ISO2022 encoding stuff.  */
1640
1641 /*
1642    It is not enough to say just "ISO2022" on encoding, we have to
1643    specify more details.  In Emacs, each coding system of ISO2022
1644    variant has the following specifications:
1645         1. Initial designation to G0 thru G3.
1646         2. Allows short-form designation?
1647         3. ASCII should be designated to G0 before control characters?
1648         4. ASCII should be designated to G0 at end of line?
1649         5. 7-bit environment or 8-bit environment?
1650         6. Use locking-shift?
1651         7. Use Single-shift?
1652    And the following two are only for Japanese:
1653         8. Use ASCII in place of JIS0201-1976-Roman?
1654         9. Use JISX0208-1983 in place of JISX0208-1978?
1655    These specifications are encoded in `coding->flags' as flag bits
1656    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1657    details.
1658 */
1659
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the new designation
   in CODING.  When CODING carries a revision number below 255 for
   CHARSET, the sequence is preceded by ESC '&' <'@' + revision>.  The
   intermediate character is dropped (short form) only for a
   two-dimensional 94-character set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  DST is the variable of that name
   at the expansion site.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    int designation_revision                                            \
      = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);              \
    unsigned char designation_final = CHARSET_ISO_FINAL_CHAR (charset); \
    int designation_multibyte = CHARSET_DIMENSION (charset) != 1;       \
    int designation_94 = CHARSET_CHARS (charset) == 94;                 \
    /* Intermediate characters for registers 0..3, indexed by REG.  */  \
    char *designation_intermediates = designation_94 ? "()*+" : ",-./"; \
                                                                        \
    if (designation_revision < 255)                                     \
      {                                                                 \
        /* Announce the revision of the character set first.  */        \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + designation_revision;                            \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (designation_multibyte)                                          \
      *dst++ = '$';                                                     \
    /* Everything but the short form carries an intermediate.  */       \
    if (! (designation_multibyte                                        \
           && designation_94                                            \
           && ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)            \
           && (reg) == 0                                                \
           && designation_final >= '@' && designation_final <= 'B'))    \
      *dst++ = (unsigned char) designation_intermediates[reg];          \
    *dst++ = designation_final;                                         \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset);              \
  } while (0)
1702
1703 /* The following two macros produce codes (control character or escape
1704    sequence) for ISO2022 single-shift functions (single-shift-2 and
1705    single-shift-3).  */
1706
/* Emit single-shift-2: the one-byte control ISO_CODE_SS2, or the
   two-byte form ESC 'N' when `coding' has CODING_FLAG_ISO_SEVEN_BITS
   set.  Then mark `coding' as single-shifting so that the next
   character is emitted through the shift.  Uses `coding' and `dst'
   of the expansion site.  */
#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1715
/* Emit single-shift-3: the one-byte control ISO_CODE_SS3, or the
   two-byte form ESC 'O' when `coding' has CODING_FLAG_ISO_SEVEN_BITS
   set.  Then mark `coding' as single-shifting so that the next
   character is emitted through the shift.  Uses `coding' and `dst'
   of the expansion site.  */
#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1724
1725 /* The following four macros produce codes (control character or
1726    escape sequence) for ISO2022 locking-shift functions (shift-in,
1727    shift-out, locking-shift-2, and locking-shift-3).  */
1728
/* Emit the shift-in control ISO_CODE_SI and record in `coding' that
   graphic register 0 is now invoked to graphic plane 0.  Uses
   `coding' and `dst' of the expansion site.  */
#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst = ISO_CODE_SI;                         \
    dst++;                                      \
  } while (0)
1734
/* Emit the shift-out control ISO_CODE_SO and record in `coding' that
   graphic register 1 is now invoked to graphic plane 0.  Uses
   `coding' and `dst' of the expansion site.  */
#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst = ISO_CODE_SO;                         \
    dst++;                                      \
  } while (0)
1740
/* Emit locking-shift-2 (ESC 'n') and record in `coding' that graphic
   register 2 is now invoked to graphic plane 0.  Uses `coding' and
   `dst' of the expansion site.  */
#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = 'n';                               \
    dst += 2;                                   \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)
1746
/* Emit locking-shift-3 (ESC 'o') and record in `coding' that graphic
   register 3 is now invoked to graphic plane 0.  Uses `coding' and
   `dst' of the expansion site.  */
#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = 'o';                               \
    dst += 2;                                   \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1752
/* Emit the single byte of a DIMENSION1 character of CHARSET whose
   position code is C1, producing designation and invocation
   sequences beforehand when CHARSET is not yet reachable.  The byte
   is written with its high bit cleared when it goes through graphic
   plane 0 and set when it goes through graphic plane 1.  When a
   single shift is pending, the byte is written at once (high bit
   cleared under CODING_FLAG_ISO_SEVEN_BITS, set otherwise) and the
   pending state is cleared.  Uses `coding' and `dst' of the
   expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? ((c1) & 0x7F) : ((c1) | 0x80));                     \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is on neither graphic plane yet.  Designate and/or      \
       invoke it, then go around again to emit the character.  */      \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1785
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position codes are C1 and C2, producing designation and invocation
   sequences beforehand when CHARSET is not yet reachable.  Both
   bytes are written with the high bit cleared when they go through
   graphic plane 0 and set when they go through graphic plane 1.
   When a single shift is pending, the bytes are written at once
   (high bits cleared under CODING_FLAG_ISO_SEVEN_BITS, set
   otherwise) and the pending state is cleared.  Uses `coding' and
   `dst' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is on neither graphic plane yet.  Designate and/or      \
       invoke it, then go around again to emit the character.  */      \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1818
/* Encode character C at DST.  C is split into a charset and position
   codes with SPLIT_CHAR.  If the charset is not defined, the position
   codes are copied to DST unchanged (the second one only when it is
   not negative).  Otherwise the character goes through
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to the dimension of its charset, after these
   substitutions requested by the flags of `coding':
     CODING_FLAG_ISO_USE_ROMAN:  ASCII -> charset_latin_jisx0201
     CODING_FLAG_ISO_USE_OLDJIS: charset_jisx0208 -> charset_jisx0208_1978
   Uses `coding' and `dst' of the expansion site.  */
#define ENCODE_ISO_CHARACTER(c)                                 \
  do {                                                          \
    int charset, c1, c2;                                        \
                                                                \
    SPLIT_CHAR (c, charset, c1, c2);                            \
    if (! CHARSET_DEFINED_P (charset))                          \
      {                                                         \
        /* Unknown charset: pass the codes through as is.  */   \
        *dst++ = c1;                                            \
        if (c2 >= 0)                                            \
          *dst++ = c2;                                          \
      }                                                         \
    else if (CHARSET_DIMENSION (charset) == 1)                  \
      {                                                         \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)         \
            && charset == CHARSET_ASCII)                        \
          charset = charset_latin_jisx0201;                     \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);          \
      }                                                         \
    else                                                        \
      {                                                         \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)        \
            && charset == charset_jisx0208)                     \
          charset = charset_jisx0208_1978;                      \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);      \
      }                                                         \
  } while (0)
1848
1849
/* Rather than encoding character C itself, encode the substitution
   character CODING_INHIBIT_CHARACTER_SUBSTITUTION in its place: once,
   and a second time when the charset of C has a width greater
   than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);       \
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) <= 1)                          \
      break;                                                            \
    /* A wide character occupies two columns; substitute twice.  */     \
    ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);       \
  } while (0)
1858
1859
1860 /* Produce designation and invocation codes at a place pointed by DST
1861    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1862    Return new DST.  */
1863
1864 unsigned char *
1865 encode_invocation_designation (charset, coding, dst)
1866      int charset;
1867      struct coding_system *coding;
1868      unsigned char *dst;
1869 {
1870   int reg;                      /* graphic register number */
1871
1872   /* At first, check designations.  */
1873   for (reg = 0; reg < 4; reg++)
1874     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1875       break;
1876
1877   if (reg >= 4)
1878     {
1879       /* CHARSET is not yet designated to any graphic registers.  */
1880       /* At first check the requested designation.  */
1881       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1882       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1883         /* Since CHARSET requests no special designation, designate it
1884            to graphic register 0.  */
1885         reg = 0;
1886
1887       ENCODE_DESIGNATION (charset, reg, coding);
1888     }
1889
1890   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1891       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1892     {
1893       /* Since the graphic register REG is not invoked to any graphic
1894          planes, invoke it to graphic plane 0.  */
1895       switch (reg)
1896         {
1897         case 0:                 /* graphic register 0 */
1898           ENCODE_SHIFT_IN;
1899           break;
1900
1901         case 1:                 /* graphic register 1 */
1902           ENCODE_SHIFT_OUT;
1903           break;
1904
1905         case 2:                 /* graphic register 2 */
1906           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1907             ENCODE_SINGLE_SHIFT_2;
1908           else
1909             ENCODE_LOCKING_SHIFT_2;
1910           break;
1911
1912         case 3:                 /* graphic register 3 */
1913           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1914             ENCODE_SINGLE_SHIFT_3;
1915           else
1916             ENCODE_LOCKING_SHIFT_3;
1917           break;
1918         }
1919     }
1920
1921   return dst;
1922 }
1923
/* Emit the two-byte code of the encoded composition rule RULE at
   DST.  RULE is decoded into a global and a new reference point with
   COMPOSITION_DECODE_RULE; the first byte is 32 + 81 + <global ref>,
   the second is 32 + <new ref>.  */

#define ENCODE_COMPOSITION_RULE(rule)           \
  do {                                          \
    int gref, nref;                             \
                                                \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    dst[0] = 32 + 81 + gref;                    \
    dst[1] = 32 + nref;                         \
    dst += 2;                                   \
  } while (0)
1933
/* Emit the escape sequence that opens a composition and remember the
   composition method in CODING.  DATA points to an array of integers
   describing the composition (see coding.h for its layout); DATA[3]
   is stored as the method.  The sequence is ESC '0' for
   COMPOSITION_RELATIVE, ESC '3' for COMPOSITION_WITH_ALTCHARS, and
   ESC '4' for any other method.  For the latter two, CODING is also
   set up to walk the composition data from offset 4 of the current
   entry, with no composition rule expected next.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    (coding)->composing = (data)[3];                                    \
    dst[0] = ISO_CODE_ESC;                                              \
    if ((coding)->composing != COMPOSITION_RELATIVE)                    \
      {                                                                 \
        dst[1] = ((coding)->composing != COMPOSITION_WITH_ALTCHARS      \
                  ? '4' : '3');                                         \
        (coding)->composition_rule_follows = 0;                         \
        (coding)->cmp_data_index = (coding)->cmp_data_start + 4;        \
      }                                                                 \
    else                                                                \
      dst[1] = '0';                                                     \
    dst += 2;                                                           \
  } while (0)
1953
/* Emit ESC '1', which closes the current composition, and mark
   CODING as no longer composing.  The start offset into the
   composition data of CODING is advanced by DATA[0]; when that
   reaches the used length of the current data block and another
   block follows, CODING moves on to the start of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    (coding)->composing = COMPOSITION_NO;                               \
    (coding)->cmp_data_start += (data)[0];                              \
    if ((coding)->cmp_data->next                                        \
        && (coding)->cmp_data_start == (coding)->cmp_data->used)        \
      {                                                                 \
        (coding)->cmp_data_start = 0;                                   \
        (coding)->cmp_data = (coding)->cmp_data->next;                  \
      }                                                                 \
  } while (0)
1969
/* Emit ESC '0' and switch CODING to COMPOSITION_RELATIVE.  The
   sequence does not open a new composition here: it signals that the
   components (alternate chars and composition rules) of the current
   composition have all been produced and that the actual text
   follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    (coding)->composing = COMPOSITION_RELATIVE; \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
1981
1982 /* The following three macros produce codes for indicating direction
1983    of text.  */
/* Emit a control sequence introducer at `dst': the two-byte form
   ESC '[' when `coding' has CODING_FLAG_ISO_SEVEN_BITS set, the
   one-byte control ISO_CODE_CSI otherwise.  `coding->flags' is a set
   of flag bits, so the seven-bit flag must be tested with `&';
   comparing the whole word for equality would select the 8-bit form
   whenever any other flag is set as well.  Uses `coding' and `dst'
   of the expansion site.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
1991
/* Emit the control sequence CSI '2' ']' at `dst', which starts
   right-to-left text.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an
   object-like do/while macro, so it can neither take an argument nor
   appear inside a comma expression; this macro is therefore a
   statement block of its own.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)
1994
/* Emit the control sequence CSI '0' ']' at `dst', which ends the
   current text direction (returning to left-to-right).
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like do/while
   macro, so it can neither take an argument nor appear inside a
   comma expression; this macro is therefore a statement block of its
   own.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1997
/* Emit the codes that bring the graphic planes and registers back to
   their initial state: a shift-in when graphic plane 0 does not hold
   register 0, followed by a designation sequence for every register
   that has an initial designation differing from its current one.
   Uses `coding' and `dst' of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        /* Registers without an initial designation are left alone.  */     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0          \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)           \
                != CODING_SPEC_ISO_DESIGNATION (coding, reg)))              \
          {                                                                 \
            ENCODE_DESIGNATION                                              \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),           \
               reg, coding);                                                \
          }                                                                 \
        reg++;                                                              \
      }                                                                     \
  } while (0)
2012
2013 /* Produce designation sequences of charsets in the line started from
2014    SRC to a place pointed by DST, and return updated DST.
2015
2016    If the current block ends before any end-of-line, we may fail to
2017    find all the necessary designations.  */
2018
2019 static unsigned char *
2020 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2021      struct coding_system *coding;
2022      Lisp_Object translation_table;
2023      unsigned char *src, *src_end, *dst;
2024 {
2025   int charset, c, found = 0, reg;
2026   /* Table of charsets to be designated to each graphic register.  */
2027   int r[4];
2028
2029   for (reg = 0; reg < 4; reg++)
2030     r[reg] = -1;
2031
2032   while (found < 4)
2033     {
2034       ONE_MORE_CHAR (c);
2035       if (c == '\n')
2036         break;
2037
2038       charset = CHAR_CHARSET (c);
2039       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2040       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2041         {
2042           found++;
2043           r[reg] = charset;
2044         }
2045     }
2046
2047  label_end_of_loop:
2048   if (found)
2049     {
2050       for (reg = 0; reg < 4; reg++)
2051         if (r[reg] >= 0
2052             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2053           ENCODE_DESIGNATION (r[reg], reg, coding);
2054     }
2055
2056   return dst;
2057 }
2058
2059 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2060
2061 static void
2062 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2063      struct coding_system *coding;
2064      unsigned char *source, *destination;
2065      int src_bytes, dst_bytes;
2066 {
2067   unsigned char *src = source;
2068   unsigned char *src_end = source + src_bytes;
2069   unsigned char *dst = destination;
2070   unsigned char *dst_end = destination + dst_bytes;
2071   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2072      from DST_END to assure overflow checking is necessary only at the
2073      head of loop.  */
2074   unsigned char *adjusted_dst_end = dst_end - 19;
2075   /* SRC_BASE remembers the start position in source in each loop.
2076      The loop will be exited when there's not enough source text to
2077      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2078      there's not enough destination area to produce encoded codes
2079      (within macro EMIT_BYTES).  */
2080   unsigned char *src_base;
2081   int c;
2082   Lisp_Object translation_table;
2083   Lisp_Object safe_chars;
2084
2085   safe_chars = coding_safe_chars (coding);
2086
2087   if (NILP (Venable_character_translation))
2088     translation_table = Qnil;
2089   else
2090     {
2091       translation_table = coding->translation_table_for_encode;
2092       if (NILP (translation_table))
2093         translation_table = Vstandard_translation_table_for_encode;
2094     }
2095
2096   coding->consumed_char = 0;
2097   coding->errors = 0;
2098   while (1)
2099     {
2100       src_base = src;
2101
2102       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2103         {
2104           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2105           break;
2106         }
2107
2108       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2109           && CODING_SPEC_ISO_BOL (coding))
2110         {
2111           /* We have to produce designation sequences if any now.  */
2112           dst = encode_designation_at_bol (coding, translation_table,
2113                                            src, src_end, dst);
2114           CODING_SPEC_ISO_BOL (coding) = 0;
2115         }
2116
2117       /* Check composition start and end.  */
2118       if (coding->composing != COMPOSITION_DISABLED
2119           && coding->cmp_data_start < coding->cmp_data->used)
2120         {
2121           struct composition_data *cmp_data = coding->cmp_data;
2122           int *data = cmp_data->data + coding->cmp_data_start;
2123           int this_pos = cmp_data->char_offset + coding->consumed_char;
2124
2125           if (coding->composing == COMPOSITION_RELATIVE)
2126             {
2127               if (this_pos == data[2])
2128                 {
2129                   ENCODE_COMPOSITION_END (coding, data);
2130                   cmp_data = coding->cmp_data;
2131                   data = cmp_data->data + coding->cmp_data_start;
2132                 }
2133             }
2134           else if (COMPOSING_P (coding))
2135             {
2136               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2137               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2138                 /* We have consumed components of the composition.
2139                    What follows in SRC is the compositions's base
2140                    text.  */
2141                 ENCODE_COMPOSITION_FAKE_START (coding);
2142               else
2143                 {
2144                   int c = cmp_data->data[coding->cmp_data_index++];
2145                   if (coding->composition_rule_follows)
2146                     {
2147                       ENCODE_COMPOSITION_RULE (c);
2148                       coding->composition_rule_follows = 0;
2149                     }
2150                   else
2151                     {
2152                       if (coding->flags & CODING_FLAG_ISO_SAFE
2153                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2154                         ENCODE_UNSAFE_CHARACTER (c);
2155                       else
2156                         ENCODE_ISO_CHARACTER (c);
2157                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2158                         coding->composition_rule_follows = 1;
2159                     }
2160                   continue;
2161                 }
2162             }
2163           if (!COMPOSING_P (coding))
2164             {
2165               if (this_pos == data[1])
2166                 {
2167                   ENCODE_COMPOSITION_START (coding, data);
2168                   continue;
2169                 }
2170             }
2171         }
2172
2173       ONE_MORE_CHAR (c);
2174
2175       /* Now encode the character C.  */
2176       if (c < 0x20 || c == 0x7F)
2177         {
2178           if (c == '\r')
2179             {
2180               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2181                 {
2182                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2183                     ENCODE_RESET_PLANE_AND_REGISTER;
2184                   *dst++ = c;
2185                   continue;
2186                 }
2187               /* fall down to treat '\r' as '\n' ...  */
2188               c = '\n';
2189             }
2190           if (c == '\n')
2191             {
2192               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2193                 ENCODE_RESET_PLANE_AND_REGISTER;
2194               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2195                 bcopy (coding->spec.iso2022.initial_designation,
2196                        coding->spec.iso2022.current_designation,
2197                        sizeof coding->spec.iso2022.initial_designation);
2198               if (coding->eol_type == CODING_EOL_LF
2199                   || coding->eol_type == CODING_EOL_UNDECIDED)
2200                 *dst++ = ISO_CODE_LF;
2201               else if (coding->eol_type == CODING_EOL_CRLF)
2202                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2203               else
2204                 *dst++ = ISO_CODE_CR;
2205               CODING_SPEC_ISO_BOL (coding) = 1;
2206             }
2207           else
2208             {
2209               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2210                 ENCODE_RESET_PLANE_AND_REGISTER;
2211               *dst++ = c;
2212             }
2213         }
2214       else if (ASCII_BYTE_P (c))
2215         ENCODE_ISO_CHARACTER (c);
2216       else if (SINGLE_BYTE_CHAR_P (c))
2217         {
2218           *dst++ = c;
2219           coding->errors++;
2220         }
2221       else if (coding->flags & CODING_FLAG_ISO_SAFE
2222                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2223         ENCODE_UNSAFE_CHARACTER (c);
2224       else
2225         ENCODE_ISO_CHARACTER (c);
2226
2227       coding->consumed_char++;
2228     }
2229
2230  label_end_of_loop:
2231   coding->consumed = src_base - source;
2232   coding->produced = coding->produced_char = dst - destination;
2233 }
2234
2235 \f
2236 /*** 4. SJIS and BIG5 handlers ***/
2237
2238 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2239    quite widely.  So, for the moment, Emacs supports them in the bare
2240    C code.  But, in the future, they may be supported only by CCL.  */
2241
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but its two position-codes are divided and
   shifted so that they fit in the ranges below.
2248
2249    --- CODE RANGE of SJIS ---
2250    (character set)      (range)
2251    ASCII                0x00 .. 0x7F
2252    KATAKANA-JISX0201    0xA0 .. 0xDF
2253    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2254             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2255    -------------------------------
2256
2257 */
2258
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set, and each of its characters is encoded in two bytes.
2262
2263    --- CODE RANGE of BIG5 ---
2264    (character set)      (range)
2265    ASCII                0x00 .. 0x7F
2266    Big5 (1st byte)      0xA1 .. 0xFE
2267         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2268    --------------------------
2269
2270    Since the number of characters in Big5 is larger than maximum
2271    characters in Emacs' charset (96x96), it can't be handled as one
2272    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2273    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2274    contains frequently used characters and the latter contains less
2275    frequently used characters.  */
2276
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every input argument is parenthesized and evaluated before any
   output argument is assigned, so arbitrary expressions may be passed
   as inputs and the same variables may serve as both inputs and
   outputs, e.g. ENCODE_BIG5 (charset, c1, c2, c1, c2).  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 byte pair B1, B2.  Lead bytes
   below 0xC9 belong to `charset_big5_1', the others to
   `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    int big5_lead = (b1);                                               \
    int big5_trail = (b2);                                              \
    /* Linear index of the character within the whole BIG5 table.  */   \
    unsigned int big5_index                                             \
      = ((big5_lead - 0xA1) * BIG5_SAME_ROW                             \
         + big5_trail - (big5_trail < 0x7F ? 0x40 : 0x62));             \
    if (big5_lead < 0xC9)                                               \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_index -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
      }                                                                 \
    (c1) = big5_index / (0xFF - 0xA1) + 0x21;                           \
    (c2) = big5_index % (0xFF - 0xA1) + 0x21;                           \
  } while (0)

/* Set the BIG5 byte pair B1, B2 from CHARSET, C1 and C2.  This is the
   inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    unsigned int big5_low;                                              \
    if ((charset) == charset_big5_2)                                    \
      big5_index += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    big5_low = big5_index % BIG5_SAME_ROW;                              \
    (b1) = big5_index / BIG5_SAME_ROW + 0xA1;                           \
    /* The first 0x3F codes of a row map to 0x40..0x7E, the rest to     \
       0xA1..0xFE.  */                                                  \
    (b2) = big5_low + (big5_low < 0x3F ? 0x40 : 0x62);                  \
  } while (0)
2309
2310 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2311    Check if a text is encoded in SJIS.  If it is, return
2312    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2313
2314 static int
2315 detect_coding_sjis (src, src_end, multibytep)
2316      unsigned char *src, *src_end;
2317      int multibytep;
2318 {
2319   int c;
2320   /* Dummy for ONE_MORE_BYTE.  */
2321   struct coding_system dummy_coding;
2322   struct coding_system *coding = &dummy_coding;
2323
2324   while (1)
2325     {
2326       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2327       if (c >= 0x81)
2328         {
2329           if (c <= 0x9F || (c >= 0xE0 && c <= 0xEF))
2330             {
2331               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2332               if (c < 0x40 || c == 0x7F || c > 0xFC)
2333                 return 0;
2334             }
2335           else if (c > 0xDF)
2336             return 0;
2337         }
2338     }
2339  label_end_of_loop:
2340   return CODING_CATEGORY_MASK_SJIS;
2341 }
2342
2343 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2344    Check if a text is encoded in BIG5.  If it is, return
2345    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2346
2347 static int
2348 detect_coding_big5 (src, src_end, multibytep)
2349      unsigned char *src, *src_end;
2350      int multibytep;
2351 {
2352   int c;
2353   /* Dummy for ONE_MORE_BYTE.  */
2354   struct coding_system dummy_coding;
2355   struct coding_system *coding = &dummy_coding;
2356
2357   while (1)
2358     {
2359       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2360       if (c >= 0xA1)
2361         {
2362           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2363           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2364             return 0;
2365         }
2366     }
2367  label_end_of_loop:
2368   return CODING_CATEGORY_MASK_BIG5;
2369 }
2370
2371 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2372    Check if a text is encoded in UTF-8.  If it is, return
2373    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2374
2375 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2376 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2377 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2378 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2379 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2380 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2381 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2382
2383 static int
2384 detect_coding_utf_8 (src, src_end, multibytep)
2385      unsigned char *src, *src_end;
2386      int multibytep;
2387 {
2388   unsigned char c;
2389   int seq_maybe_bytes;
2390   /* Dummy for ONE_MORE_BYTE.  */
2391   struct coding_system dummy_coding;
2392   struct coding_system *coding = &dummy_coding;
2393
2394   while (1)
2395     {
2396       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2397       if (UTF_8_1_OCTET_P (c))
2398         continue;
2399       else if (UTF_8_2_OCTET_LEADING_P (c))
2400         seq_maybe_bytes = 1;
2401       else if (UTF_8_3_OCTET_LEADING_P (c))
2402         seq_maybe_bytes = 2;
2403       else if (UTF_8_4_OCTET_LEADING_P (c))
2404         seq_maybe_bytes = 3;
2405       else if (UTF_8_5_OCTET_LEADING_P (c))
2406         seq_maybe_bytes = 4;
2407       else if (UTF_8_6_OCTET_LEADING_P (c))
2408         seq_maybe_bytes = 5;
2409       else
2410         return 0;
2411
2412       do
2413         {
2414           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2415           if (!UTF_8_EXTRA_OCTET_P (c))
2416             return 0;
2417           seq_maybe_bytes--;
2418         }
2419       while (seq_maybe_bytes > 0);
2420     }
2421
2422  label_end_of_loop:
2423   return CODING_CATEGORY_MASK_UTF_8;
2424 }
2425
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by examining its first two
   bytes.  If they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE; if they are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE; else return 0.  */
2431
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   mask keeps the top six bits, which are what identify the range;
   masking with 0xD800 itself would also match values such as 0xDC00
   and 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2441
2442 static int
2443 detect_coding_utf_16 (src, src_end, multibytep)
2444      unsigned char *src, *src_end;
2445      int multibytep;
2446 {
2447   unsigned char c1, c2;
2448   /* Dummy for TWO_MORE_BYTES.  */
2449   struct coding_system dummy_coding;
2450   struct coding_system *coding = &dummy_coding;
2451
2452   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2453   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2454
2455   if ((c1 == 0xFF) && (c2 == 0xFE))
2456     return CODING_CATEGORY_MASK_UTF_16_LE;
2457   else if ((c1 == 0xFE) && (c2 == 0xFF))
2458     return CODING_CATEGORY_MASK_UTF_16_BE;
2459
2460  label_end_of_loop:
2461   return 0;
2462 }
2463
2464 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2465    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2466
2467 static void
2468 decode_coding_sjis_big5 (coding, source, destination,
2469                          src_bytes, dst_bytes, sjis_p)
2470      struct coding_system *coding;
2471      unsigned char *source, *destination;
2472      int src_bytes, dst_bytes;
2473      int sjis_p;
2474 {
2475   unsigned char *src = source;
2476   unsigned char *src_end = source + src_bytes;
2477   unsigned char *dst = destination;
2478   unsigned char *dst_end = destination + dst_bytes;
2479   /* SRC_BASE remembers the start position in source in each loop.
2480      The loop will be exited when there's not enough source code
2481      (within macro ONE_MORE_BYTE), or when there's not enough
2482      destination area to produce a character (within macro
2483      EMIT_CHAR).  */
2484   unsigned char *src_base;
2485   Lisp_Object translation_table;
2486
2487   if (NILP (Venable_character_translation))
2488     translation_table = Qnil;
2489   else
2490     {
2491       translation_table = coding->translation_table_for_decode;
2492       if (NILP (translation_table))
2493         translation_table = Vstandard_translation_table_for_decode;
2494     }
2495
2496   coding->produced_char = 0;
2497   while (1)
2498     {
2499       int c, charset, c1, c2;
2500
2501       src_base = src;
2502       ONE_MORE_BYTE (c1);
2503
2504       if (c1 < 0x80)
2505         {
2506           charset = CHARSET_ASCII;
2507           if (c1 < 0x20)
2508             {
2509               if (c1 == '\r')
2510                 {
2511                   if (coding->eol_type == CODING_EOL_CRLF)
2512                     {
2513                       ONE_MORE_BYTE (c2);
2514                       if (c2 == '\n')
2515                         c1 = c2;
2516                       else if (coding->mode
2517                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2518                         {
2519                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2520                           goto label_end_of_loop;
2521                         }
2522                       else
2523                         /* To process C2 again, SRC is subtracted by 1.  */
2524                         src--;
2525                     }
2526                   else if (coding->eol_type == CODING_EOL_CR)
2527                     c1 = '\n';
2528                 }
2529               else if (c1 == '\n'
2530                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2531                        && (coding->eol_type == CODING_EOL_CR
2532                            || coding->eol_type == CODING_EOL_CRLF))
2533                 {
2534                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2535                   goto label_end_of_loop;
2536                 }
2537             }
2538         }
2539       else
2540         {
2541           if (sjis_p)
2542             {
2543               if (c1 >= 0xF0)
2544                 goto label_invalid_code;
2545               if (c1 < 0xA0 || c1 >= 0xE0)
2546                 {
2547                   /* SJIS -> JISX0208 */
2548                   ONE_MORE_BYTE (c2);
2549                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2550                     goto label_invalid_code;
2551                   DECODE_SJIS (c1, c2, c1, c2);
2552                   charset = charset_jisx0208;
2553                 }
2554               else
2555                 /* SJIS -> JISX0201-Kana */
2556                 charset = charset_katakana_jisx0201;
2557             }
2558           else
2559             {
2560               /* BIG5 -> Big5 */
2561               if (c1 < 0xA1 || c1 > 0xFE)
2562                 goto label_invalid_code;
2563               ONE_MORE_BYTE (c2);
2564               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2565                 goto label_invalid_code;
2566               DECODE_BIG5 (c1, c2, charset, c1, c2);
2567             }
2568         }
2569
2570       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2571       EMIT_CHAR (c);
2572       continue;
2573
2574     label_invalid_code:
2575       coding->errors++;
2576       src = src_base;
2577       c = *src++;
2578       EMIT_CHAR (c);
2579     }
2580
2581  label_end_of_loop:
2582   coding->consumed = coding->consumed_char = src_base - source;
2583   coding->produced = dst - destination;
2584   return;
2585 }
2586
2587 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2588    This function can encode charsets `ascii', `katakana-jisx0201',
2589    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2590    are sure that all these charsets are registered as official charset
2591    (i.e. do not have extended leading-codes).  Characters of other
2592    charsets are produced without any encoding.  If SJIS_P is 1, encode
2593    SJIS text, else encode BIG5 text.  */
2594
2595 static void
2596 encode_coding_sjis_big5 (coding, source, destination,
2597                          src_bytes, dst_bytes, sjis_p)
2598      struct coding_system *coding;
2599      unsigned char *source, *destination;
2600      int src_bytes, dst_bytes;
2601      int sjis_p;
2602 {
2603   unsigned char *src = source;
2604   unsigned char *src_end = source + src_bytes;
2605   unsigned char *dst = destination;
2606   unsigned char *dst_end = destination + dst_bytes;
2607   /* SRC_BASE remembers the start position in source in each loop.
2608      The loop will be exited when there's not enough source text to
2609      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2610      there's not enough destination area to produce encoded codes
2611      (within macro EMIT_BYTES).  */
2612   unsigned char *src_base;
2613   Lisp_Object translation_table;
2614
2615   if (NILP (Venable_character_translation))
2616     translation_table = Qnil;
2617   else
2618     {
2619       translation_table = coding->translation_table_for_encode;
2620       if (NILP (translation_table))
2621         translation_table = Vstandard_translation_table_for_encode;
2622     }
2623
2624   while (1)
2625     {
2626       int c, charset, c1, c2;
2627
2628       src_base = src;
2629       ONE_MORE_CHAR (c);
2630
2631       /* Now encode the character C.  */
2632       if (SINGLE_BYTE_CHAR_P (c))
2633         {
2634           switch (c)
2635             {
2636             case '\r':
2637               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2638                 {
2639                   EMIT_ONE_BYTE (c);
2640                   break;
2641                 }
2642               c = '\n';
2643             case '\n':
2644               if (coding->eol_type == CODING_EOL_CRLF)
2645                 {
2646                   EMIT_TWO_BYTES ('\r', c);
2647                   break;
2648                 }
2649               else if (coding->eol_type == CODING_EOL_CR)
2650                 c = '\r';
2651             default:
2652               EMIT_ONE_BYTE (c);
2653             }
2654         }
2655       else
2656         {
2657           SPLIT_CHAR (c, charset, c1, c2);
2658           if (sjis_p)
2659             {
2660               if (charset == charset_jisx0208
2661                   || charset == charset_jisx0208_1978)
2662                 {
2663                   ENCODE_SJIS (c1, c2, c1, c2);
2664                   EMIT_TWO_BYTES (c1, c2);
2665                 }
2666               else if (charset == charset_katakana_jisx0201)
2667                 EMIT_ONE_BYTE (c1 | 0x80);
2668               else if (charset == charset_latin_jisx0201)
2669                 EMIT_ONE_BYTE (c1);
2670               else
2671                 /* There's no way other than producing the internal
2672                    codes as is.  */
2673                 EMIT_BYTES (src_base, src);
2674             }
2675           else
2676             {
2677               if (charset == charset_big5_1 || charset == charset_big5_2)
2678                 {
2679                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2680                   EMIT_TWO_BYTES (c1, c2);
2681                 }
2682               else
2683                 /* There's no way other than producing the internal
2684                    codes as is.  */
2685                 EMIT_BYTES (src_base, src);
2686             }
2687         }
2688       coding->consumed_char++;
2689     }
2690
2691  label_end_of_loop:
2692   coding->consumed = src_base - source;
2693   coding->produced = coding->produced_char = dst - destination;
2694 }
2695
2696 \f
2697 /*** 5. CCL handlers ***/
2698
2699 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2700    Check if a text is encoded in a coding system of which
2701    encoder/decoder are written in CCL program.  If it is, return
2702    CODING_CATEGORY_MASK_CCL, else return 0.  */
2703
2704 static int
2705 detect_coding_ccl (src, src_end, multibytep)
2706      unsigned char *src, *src_end;
2707      int multibytep;
2708 {
2709   unsigned char *valid;
2710   int c;
2711   /* Dummy for ONE_MORE_BYTE.  */
2712   struct coding_system dummy_coding;
2713   struct coding_system *coding = &dummy_coding;
2714
2715   /* No coding system is assigned to coding-category-ccl.  */
2716   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2717     return 0;
2718
2719   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2720   while (1)
2721     {
2722       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2723       if (! valid[c])
2724         return 0;
2725     }
2726  label_end_of_loop:
2727   return CODING_CATEGORY_MASK_CCL;
2728 }
2729
2730 \f
2731 /*** 6. End-of-line handlers ***/
2732
2733 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2734
2735 static void
2736 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2737      struct coding_system *coding;
2738      unsigned char *source, *destination;
2739      int src_bytes, dst_bytes;
2740 {
2741   unsigned char *src = source;
2742   unsigned char *dst = destination;
2743   unsigned char *src_end = src + src_bytes;
2744   unsigned char *dst_end = dst + dst_bytes;
2745   Lisp_Object translation_table;
2746   /* SRC_BASE remembers the start position in source in each loop.
2747      The loop will be exited when there's not enough source code
2748      (within macro ONE_MORE_BYTE), or when there's not enough
2749      destination area to produce a character (within macro
2750      EMIT_CHAR).  */
2751   unsigned char *src_base;
2752   int c;
2753
2754   translation_table = Qnil;
2755   switch (coding->eol_type)
2756     {
2757     case CODING_EOL_CRLF:
2758       while (1)
2759         {
2760           src_base = src;
2761           ONE_MORE_BYTE (c);
2762           if (c == '\r')
2763             {
2764               ONE_MORE_BYTE (c);
2765               if (c != '\n')
2766                 {
2767                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2768                     {
2769                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2770                       goto label_end_of_loop;
2771                     }
2772                   src--;
2773                   c = '\r';
2774                 }
2775             }
2776           else if (c == '\n'
2777                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2778             {
2779               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2780               goto label_end_of_loop;
2781             }
2782           EMIT_CHAR (c);
2783         }
2784       break;
2785
2786     case CODING_EOL_CR:
2787       while (1)
2788         {
2789           src_base = src;
2790           ONE_MORE_BYTE (c);
2791           if (c == '\n')
2792             {
2793               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2794                 {
2795                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2796                   goto label_end_of_loop;
2797                 }
2798             }
2799           else if (c == '\r')
2800             c = '\n';
2801           EMIT_CHAR (c);
2802         }
2803       break;
2804
2805     default:                    /* no need for EOL handling */
2806       while (1)
2807         {
2808           src_base = src;
2809           ONE_MORE_BYTE (c);
2810           EMIT_CHAR (c);
2811         }
2812     }
2813
2814  label_end_of_loop:
2815   coding->consumed = coding->consumed_char = src_base - source;
2816   coding->produced = dst - destination;
2817   return;
2818 }
2819
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
   the format of end-of-line according to `coding->eol_type'.  It also
   converts multibyte-form 8-bit characters to unibyte if
   CODING->src_multibyte is nonzero.  If `coding->mode &
   CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
   also means end-of-line.  */
2826
2827 static void
2828 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2829      struct coding_system *coding;
2830      unsigned char *source, *destination;
2831      int src_bytes, dst_bytes;
2832 {
2833   unsigned char *src = source;
2834   unsigned char *dst = destination;
2835   unsigned char *src_end = src + src_bytes;
2836   unsigned char *dst_end = dst + dst_bytes;
2837   Lisp_Object translation_table;
2838   /* SRC_BASE remembers the start position in source in each loop.
2839      The loop will be exited when there's not enough source text to
2840      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2841      there's not enough destination area to produce encoded codes
2842      (within macro EMIT_BYTES).  */
2843   unsigned char *src_base;
2844   int c;
2845   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2846
2847   translation_table = Qnil;
2848   if (coding->src_multibyte
2849       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2850     {
2851       src_end--;
2852       src_bytes--;
2853       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2854     }
2855
2856   if (coding->eol_type == CODING_EOL_CRLF)
2857     {
2858       while (src < src_end)
2859         {
2860           src_base = src;
2861           c = *src++;
2862           if (c >= 0x20)
2863             EMIT_ONE_BYTE (c);
2864           else if (c == '\n' || (c == '\r' && selective_display))
2865             EMIT_TWO_BYTES ('\r', '\n');
2866           else
2867             EMIT_ONE_BYTE (c);
2868         }
2869       src_base = src;
2870     label_end_of_loop:
2871       ;
2872     }
2873   else
2874     {
2875       if (!dst_bytes || src_bytes <= dst_bytes)
2876         {
2877           safe_bcopy (src, dst, src_bytes);
2878           src_base = src_end;
2879           dst += src_bytes;
2880         }
2881       else
2882         {
2883           if (coding->src_multibyte
2884               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2885             dst_bytes--;
2886           safe_bcopy (src, dst, dst_bytes);
2887           src_base = src + dst_bytes;
2888           dst = destination + dst_bytes;
2889           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2890         }
2891       if (coding->eol_type == CODING_EOL_CR)
2892         {
2893           for (src = destination; src < dst; src++)
2894             if (*src == '\n') *src = '\r';
2895         }
2896       else if (selective_display)
2897         {
2898           for (src = destination; src < dst; src++)
2899             if (*src == '\r') *src = '\n';
2900         }
2901     }
2902   if (coding->src_multibyte)
2903     dst = destination + str_as_unibyte (destination, dst - destination);
2904
2905   coding->consumed = src_base - source;
2906   coding->produced = dst - destination;
2907   coding->produced_char = coding->produced;
2908 }
2909
2910 \f
2911 /*** 7. C library functions ***/
2912
2913 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2914    has a property `coding-system'.  The value of this property is a
2915    vector of length 5 (called as coding-vector).  Among elements of
2916    this vector, the first (element[0]) and the fifth (element[4])
2917    carry important information for decoding/encoding.  Before
2918    decoding/encoding, this information should be set in fields of a
2919    structure of type `coding_system'.
2920
2921    A value of property `coding-system' can be a symbol of another
2922    subsidiary coding-system.  In that case, Emacs gets coding-vector
2923    from that symbol.
2924
2925    `element[0]' contains information to be set in `coding->type'.  The
2926    value and its meaning is as follows:
2927
2928    0 -- coding_type_emacs_mule
2929    1 -- coding_type_sjis
2930    2 -- coding_type_iso2022
2931    3 -- coding_type_big5
2932    4 -- coding_type_ccl encoder/decoder written in CCL
2933    nil -- coding_type_no_conversion
2934    t -- coding_type_undecided (automatic conversion on decoding,
2935                                no-conversion on encoding)
2936
2937    `element[4]' contains information to be set in `coding->flags' and
2938    `coding->spec'.  The meaning varies by `coding->type'.
2939
2940    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2941    of length 32 (of which the first 17 sub-elements are used now).
2942    Meanings of these sub-elements are:
2943
2944    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2945         If the value is an integer of valid charset, the charset is
2946         assumed to be designated to graphic register N initially.
2947
2948         If the value is minus, it is a minus value of charset which
2949         reserves graphic register N, which means that the charset is
2950         not designated initially but should be designated to graphic
2951         register N just before encoding a character in that charset.
2952
2953         If the value is nil, graphic register N is never used on
2954         encoding.
2955
2956    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2957         Each value takes t or nil.  See the section ISO2022 of
2958         `coding.h' for more information.
2959
2960    If `coding->type' is `coding_type_big5', element[4] is t to denote
2961    BIG5-ETen or nil to denote BIG5-HKU.
2962
2963    If `coding->type' takes the other value, element[4] is ignored.
2964
2965    Emacs Lisp's coding system also carries information about format of
2966    end-of-line in a value of property `eol-type'.  If the value is
2967    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2968    means CODING_EOL_CR.  If it is not integer, it should be a vector
2969    of subsidiary coding systems of which property `eol-type' has one
2970    of above values.
2971
2972 */
2973
2974 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2975    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2976    is setup so that no conversion is necessary and return -1, else
2977    return 0.  */
2978
2979 int
2980 setup_coding_system (coding_system, coding)
2981      Lisp_Object coding_system;
2982      struct coding_system *coding;
2983 {
2984   Lisp_Object coding_spec, coding_type, eol_type, plist;
2985   Lisp_Object val;
2986   int i;
2987
2988   /* At first, zero clear all members.  */
2989   bzero (coding, sizeof (struct coding_system));
2990
2991   /* Initialize some fields required for all kinds of coding systems.  */
2992   coding->symbol = coding_system;
2993   coding->heading_ascii = -1;
2994   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2995   coding->composing = COMPOSITION_DISABLED;
2996   coding->cmp_data = NULL;
2997
2998   if (NILP (coding_system))
2999     goto label_invalid_coding_system;
3000
3001   coding_spec = Fget (coding_system, Qcoding_system);
3002
3003   if (!VECTORP (coding_spec)
3004       || XVECTOR (coding_spec)->size != 5
3005       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3006     goto label_invalid_coding_system;
3007
3008   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3009   if (VECTORP (eol_type))
3010     {
3011       coding->eol_type = CODING_EOL_UNDECIDED;
3012       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3013     }
3014   else if (XFASTINT (eol_type) == 1)
3015     {
3016       coding->eol_type = CODING_EOL_CRLF;
3017       coding->common_flags
3018         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3019     }
3020   else if (XFASTINT (eol_type) == 2)
3021     {
3022       coding->eol_type = CODING_EOL_CR;
3023       coding->common_flags
3024         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3025     }
3026   else
3027     coding->eol_type = CODING_EOL_LF;
3028
3029   coding_type = XVECTOR (coding_spec)->contents[0];
3030   /* Try short cut.  */
3031   if (SYMBOLP (coding_type))
3032     {
3033       if (EQ (coding_type, Qt))
3034         {
3035           coding->type = coding_type_undecided;
3036           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3037         }
3038       else
3039         coding->type = coding_type_no_conversion;
3040       /* Initialize this member.  Any thing other than
3041          CODING_CATEGORY_IDX_UTF_16_BE and
3042          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3043          special treatment in detect_eol.  */
3044       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3045
3046       return 0;
3047     }
3048
3049   /* Get values of coding system properties:
3050      `post-read-conversion', `pre-write-conversion',
3051      `translation-table-for-decode', `translation-table-for-encode'.  */
3052   plist = XVECTOR (coding_spec)->contents[3];
3053   /* Pre & post conversion functions should be disabled if
3054      inhibit_eol_conversion is nozero.  This is the case that a code
3055      conversion function is called while those functions are running.  */
3056   if (! inhibit_pre_post_conversion)
3057     {
3058       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3059       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3060     }
3061   val = Fplist_get (plist, Qtranslation_table_for_decode);
3062   if (SYMBOLP (val))
3063     val = Fget (val, Qtranslation_table_for_decode);
3064   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3065   val = Fplist_get (plist, Qtranslation_table_for_encode);
3066   if (SYMBOLP (val))
3067     val = Fget (val, Qtranslation_table_for_encode);
3068   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3069   val = Fplist_get (plist, Qcoding_category);
3070   if (!NILP (val))
3071     {
3072       val = Fget (val, Qcoding_category_index);
3073       if (INTEGERP (val))
3074         coding->category_idx = XINT (val);
3075       else
3076         goto label_invalid_coding_system;
3077     }
3078   else
3079     goto label_invalid_coding_system;
3080
3081   /* If the coding system has non-nil `composition' property, enable
3082      composition handling.  */
3083   val = Fplist_get (plist, Qcomposition);
3084   if (!NILP (val))
3085     coding->composing = COMPOSITION_NO;
3086
3087   switch (XFASTINT (coding_type))
3088     {
3089     case 0:
3090       coding->type = coding_type_emacs_mule;
3091       if (!NILP (coding->post_read_conversion))
3092         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3093       if (!NILP (coding->pre_write_conversion))
3094         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3095       break;
3096
3097     case 1:
3098       coding->type = coding_type_sjis;
3099       coding->common_flags
3100         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3101       break;
3102
3103     case 2:
3104       coding->type = coding_type_iso2022;
3105       coding->common_flags
3106         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3107       {
3108         Lisp_Object val, temp;
3109         Lisp_Object *flags;
3110         int i, charset, reg_bits = 0;
3111
3112         val = XVECTOR (coding_spec)->contents[4];
3113
3114         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3115           goto label_invalid_coding_system;
3116
3117         flags = XVECTOR (val)->contents;
3118         coding->flags
3119           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3120              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3121              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3122              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3123              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3124              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3125              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3126              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3127              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3128              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3129              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3130              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3131              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3132              );
3133
3134         /* Invoke graphic register 0 to plane 0.  */
3135         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3136         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3137         CODING_SPEC_ISO_INVOCATION (coding, 1)
3138           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3139         /* Not single shifting at first.  */
3140         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3141         /* Beginning of buffer should also be regarded as bol. */
3142         CODING_SPEC_ISO_BOL (coding) = 1;
3143
3144         for (charset = 0; charset <= MAX_CHARSET; charset++)
3145           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3146         val = Vcharset_revision_alist;
3147         while (CONSP (val))
3148           {
3149             charset = get_charset_id (Fcar_safe (XCAR (val)));
3150             if (charset >= 0
3151                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3152                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3153               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3154             val = XCDR (val);
3155           }
3156
3157         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3158            FLAGS[REG] can be one of below:
3159                 integer CHARSET: CHARSET occupies register I,
3160                 t: designate nothing to REG initially, but can be used
3161                   by any charsets,
3162                 list of integer, nil, or t: designate the first
3163                   element (if integer) to REG initially, the remaining
3164                   elements (if integer) is designated to REG on request,
3165                   if an element is t, REG can be used by any charsets,
3166                 nil: REG is never used.  */
3167         for (charset = 0; charset <= MAX_CHARSET; charset++)
3168           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3169             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3170         for (i = 0; i < 4; i++)
3171           {
3172             if (INTEGERP (flags[i])
3173                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3174                 || (charset = get_charset_id (flags[i])) >= 0)
3175               {
3176                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3177                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3178               }
3179             else if (EQ (flags[i], Qt))
3180               {
3181                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3182                 reg_bits |= 1 << i;
3183                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3184               }
3185             else if (CONSP (flags[i]))
3186               {
3187                 Lisp_Object tail;
3188                 tail = flags[i];
3189
3190                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3191                 if (INTEGERP (XCAR (tail))
3192                     && (charset = XINT (XCAR (tail)),
3193                         CHARSET_VALID_P (charset))
3194                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3195                   {
3196                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3197                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3198                   }
3199                 else
3200                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3201                 tail = XCDR (tail);
3202                 while (CONSP (tail))
3203                   {
3204                     if (INTEGERP (XCAR (tail))
3205                         && (charset = XINT (XCAR (tail)),
3206                             CHARSET_VALID_P (charset))
3207                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3208                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3209                         = i;
3210                     else if (EQ (XCAR (tail), Qt))
3211                       reg_bits |= 1 << i;
3212                     tail = XCDR (tail);
3213                   }
3214               }
3215             else
3216               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3217
3218             CODING_SPEC_ISO_DESIGNATION (coding, i)
3219               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3220           }
3221
3222         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3223           {
3224             /* REG 1 can be used only by locking shift in 7-bit env.  */
3225             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3226               reg_bits &= ~2;
3227             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3228               /* Without any shifting, only REG 0 and 1 can be used.  */
3229               reg_bits &= 3;
3230           }
3231
3232         if (reg_bits)
3233           for (charset = 0; charset <= MAX_CHARSET; charset++)
3234             {
3235               if (CHARSET_VALID_P (charset)
3236                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3237                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3238                 {
3239                   /* There exist some default graphic registers to be
3240                      used by CHARSET.  */
3241
3242                   /* We had better avoid designating a charset of
3243                      CHARS96 to REG 0 as far as possible.  */
3244                   if (CHARSET_CHARS (charset) == 96)
3245                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3246                       = (reg_bits & 2
3247                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3248                   else
3249                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3250                       = (reg_bits & 1
3251                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3252                 }
3253             }
3254       }
3255       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3256       coding->spec.iso2022.last_invalid_designation_register = -1;
3257       break;
3258
3259     case 3:
3260       coding->type = coding_type_big5;
3261       coding->common_flags
3262         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3263       coding->flags
3264         = (NILP (XVECTOR (coding_spec)->contents[4])
3265            ? CODING_FLAG_BIG5_HKU
3266            : CODING_FLAG_BIG5_ETEN);
3267       break;
3268
3269     case 4:
3270       coding->type = coding_type_ccl;
3271       coding->common_flags
3272         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3273       {
3274         val = XVECTOR (coding_spec)->contents[4];
3275         if (! CONSP (val)
3276             || setup_ccl_program (&(coding->spec.ccl.decoder),
3277                                   XCAR (val)) < 0
3278             || setup_ccl_program (&(coding->spec.ccl.encoder),
3279                                   XCDR (val)) < 0)
3280           goto label_invalid_coding_system;
3281
3282         bzero (coding->spec.ccl.valid_codes, 256);
3283         val = Fplist_get (plist, Qvalid_codes);
3284         if (CONSP (val))
3285           {
3286             Lisp_Object this;
3287
3288             for (; CONSP (val); val = XCDR (val))
3289               {
3290                 this = XCAR (val);
3291                 if (INTEGERP (this)
3292                     && XINT (this) >= 0 && XINT (this) < 256)
3293                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3294                 else if (CONSP (this)
3295                          && INTEGERP (XCAR (this))
3296                          && INTEGERP (XCDR (this)))
3297                   {
3298                     int start = XINT (XCAR (this));
3299                     int end = XINT (XCDR (this));
3300
3301                     if (start >= 0 && start <= end && end < 256)
3302                       while (start <= end)
3303                         coding->spec.ccl.valid_codes[start++] = 1;
3304                   }
3305               }
3306           }
3307       }
3308       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3309       coding->spec.ccl.cr_carryover = 0;
3310       break;
3311
3312     case 5:
3313       coding->type = coding_type_raw_text;
3314       break;
3315
3316     default:
3317       goto label_invalid_coding_system;
3318     }
3319   return 0;
3320
3321  label_invalid_coding_system:
3322   coding->type = coding_type_no_conversion;
3323   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3324   coding->common_flags = 0;
3325   coding->eol_type = CODING_EOL_LF;
3326   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3327   return -1;
3328 }
3329
3330 /* Free memory blocks allocated for storing composition information.  */
3331
3332 void
3333 coding_free_composition_data (coding)
3334      struct coding_system *coding;
3335 {
3336   struct composition_data *cmp_data = coding->cmp_data, *next;
3337
3338   if (!cmp_data)
3339     return;
3340   /* Memory blocks are chained.  At first, rewind to the first, then,
3341      free blocks one by one.  */
3342   while (cmp_data->prev)
3343     cmp_data = cmp_data->prev;
3344   while (cmp_data)
3345     {
3346       next = cmp_data->next;
3347       xfree (cmp_data);
3348       cmp_data = next;
3349     }
3350   coding->cmp_data = NULL;
3351 }
3352
3353 /* Set `char_offset' member of all memory blocks pointed by
3354    coding->cmp_data to POS.  */
3355
3356 void
3357 coding_adjust_composition_offset (coding, pos)
3358      struct coding_system *coding;
3359      int pos;
3360 {
3361   struct composition_data *cmp_data;
3362
3363   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3364     cmp_data->char_offset = pos;
3365 }
3366
3367 /* Setup raw-text or one of its subsidiaries in the structure
3368    coding_system CODING according to the already setup value eol_type
3369    in CODING.  CODING should be setup for some coding system in
3370    advance.  */
3371
3372 void
3373 setup_raw_text_coding_system (coding)
3374      struct coding_system *coding;
3375 {
3376   if (coding->type != coding_type_raw_text)
3377     {
3378       coding->symbol = Qraw_text;
3379       coding->type = coding_type_raw_text;
3380       if (coding->eol_type != CODING_EOL_UNDECIDED)
3381         {
3382           Lisp_Object subsidiaries;
3383           subsidiaries = Fget (Qraw_text, Qeol_type);
3384
3385           if (VECTORP (subsidiaries)
3386               && XVECTOR (subsidiaries)->size == 3)
3387             coding->symbol
3388               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3389         }
3390       setup_coding_system (coding->symbol, coding);
3391     }
3392   return;
3393 }
3394
3395 /* Emacs has a mechanism to automatically detect a coding system if it
3396    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3397    it's impossible to distinguish some coding systems accurately
3398    because they use the same range of codes.  So, at first, coding
3399    systems are grouped into the following categories:
3400
3401    o coding-category-emacs-mule
3402
3403         The category for a coding system which has the same code range
3404         as Emacs' internal format.  Assigned the coding-system (Lisp
3405         symbol) `emacs-mule' by default.
3406
3407    o coding-category-sjis
3408
3409         The category for a coding system which has the same code range
3410         as SJIS.  Assigned the coding-system (Lisp
3411         symbol) `japanese-shift-jis' by default.
3412
3413    o coding-category-iso-7
3414
3415         The category for a coding system which has the same code range
3416         as ISO2022 of 7-bit environment.  This doesn't use any locking
3417         shift and single shift functions.  This can encode/decode all
3418         charsets.  Assigned the coding-system (Lisp symbol)
3419         `iso-2022-7bit' by default.
3420
3421    o coding-category-iso-7-tight
3422
3423         Same as coding-category-iso-7 except that this can
3424         encode/decode only the specified charsets.
3425
3426    o coding-category-iso-8-1
3427
3428         The category for a coding system which has the same code range
3429         as ISO2022 of 8-bit environment and graphic plane 1 used only
3430         for DIMENSION1 charset.  This doesn't use any locking shift
3431         and single shift functions.  Assigned the coding-system (Lisp
3432         symbol) `iso-latin-1' by default.
3433
3434    o coding-category-iso-8-2
3435
3436         The category for a coding system which has the same code range
3437         as ISO2022 of 8-bit environment and graphic plane 1 used only
3438         for DIMENSION2 charset.  This doesn't use any locking shift
3439         and single shift functions.  Assigned the coding-system (Lisp
3440         symbol) `japanese-iso-8bit' by default.
3441
3442    o coding-category-iso-7-else
3443
3444         The category for a coding system which has the same code range
3445         as ISO2022 of 7-bit environment but uses locking shift or
3446         single shift functions.  Assigned the coding-system (Lisp
3447         symbol) `iso-2022-7bit-lock' by default.
3448
3449    o coding-category-iso-8-else
3450
3451         The category for a coding system which has the same code range
3452         as ISO2022 of 8-bit environment but uses locking shift or
3453         single shift functions.  Assigned the coding-system (Lisp
3454         symbol) `iso-2022-8bit-ss2' by default.
3455
3456    o coding-category-big5
3457
3458         The category for a coding system which has the same code range
3459         as BIG5.  Assigned the coding-system (Lisp symbol)
3460         `cn-big5' by default.
3461
3462    o coding-category-utf-8
3463
3464         The category for a coding system which has the same code range
3465         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3466         symbol) `utf-8' by default.
3467
3468    o coding-category-utf-16-be
3469
3470         The category for a coding system in which a text has an
3471         Unicode signature (cf. Unicode Standard) in the order of BIG
3472         endian at the head.  Assigned the coding-system (Lisp symbol)
3473         `utf-16-be' by default.
3474
3475    o coding-category-utf-16-le
3476
3477         The category for a coding system in which a text has an
3478         Unicode signature (cf. Unicode Standard) in the order of
3479         LITTLE endian at the head.  Assigned the coding-system (Lisp
3480         symbol) `utf-16-le' by default.
3481
3482    o coding-category-ccl
3483
3484         The category for a coding system of which encoder/decoder is
3485         written in CCL programs.  The default value is nil, i.e., no
3486         coding system is assigned.
3487
3488    o coding-category-binary
3489
3490         The category for a coding system not categorized in any of the
3491         above.  Assigned the coding-system (Lisp symbol)
3492         `no-conversion' by default.
3493
3494    Each of them is a Lisp symbol and the value is an actual
3495    `coding-system's (this is also a Lisp symbol) assigned by a user.
3496    What Emacs does actually is to detect a category of coding system.
3497    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3498    decide only one possible category, it selects a category of the
3499    highest priority.  Priorities of categories are also specified by a
3500    user in a Lisp variable `coding-category-list'.
3501
3502 */
3503
/* Table indexed by a byte value.  detect_coding_mask skips over the
   leading bytes of a text whose entry here is nonzero, and toggles
   the entries for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC while it
   runs.  */
static int ascii_skip_code[256];
3506
3507 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3508    If it detects possible coding systems, return an integer in which
3509    appropriate flag bits are set.  Flag bits are defined by macros
3510    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3511    it should point the table `coding_priorities'.  In that case, only
3512    the flag bit for a coding system of the highest priority is set in
3513    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
3514    range 0x80..0x9F are in multibyte form.
3515
3516    How many ASCII characters are at the head is returned as *SKIP.  */
3517
3518 static int
3519 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
3520      unsigned char *source;
3521      int src_bytes, *priorities, *skip;
3522      int multibytep;
3523 {
3524   register unsigned char c;
3525   unsigned char *src = source, *src_end = source + src_bytes;
3526   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3527   int i, idx;
3528
3529   /* At first, skip all ASCII characters and control characters except
3530      for three ISO2022 specific control characters.  */
3531   ascii_skip_code[ISO_CODE_SO] = 0;
3532   ascii_skip_code[ISO_CODE_SI] = 0;
3533   ascii_skip_code[ISO_CODE_ESC] = 0;
3534
3535  label_loop_detect_coding:
3536   while (src < src_end && ascii_skip_code[*src]) src++;
3537   *skip = src - source;
3538
3539   if (src >= src_end)
3540     /* We found nothing other than ASCII.  There's nothing to do.  */
3541     return 0;
3542
3543   c = *src;
3544   /* The text seems to be encoded in some multilingual coding system.
3545      Now, try to find in which coding system the text is encoded.  */
3546   if (c < 0x80)
3547     {
3548       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3549       /* C is an ISO2022 specific control code of C0.  */
3550       mask = detect_coding_iso2022 (src, src_end, multibytep);
3551       if (mask == 0)
3552         {
3553           /* No valid ISO2022 code follows C.  Try again.  */
3554           src++;
3555           if (c == ISO_CODE_ESC)
3556             ascii_skip_code[ISO_CODE_ESC] = 1;
3557           else
3558             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3559           goto label_loop_detect_coding;
3560         }
3561       if (priorities)
3562         {
3563           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3564             {
3565               if (mask & priorities[i])
3566                 return priorities[i];
3567             }
3568           return CODING_CATEGORY_MASK_RAW_TEXT;
3569         }
3570     }
3571   else
3572     {
3573       int try;
3574
3575       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
3576         c = *src++ - 0x20;
3577
3578       if (c < 0xA0)
3579         {
3580           /* C is the first byte of SJIS character code,
3581              or a leading-code of Emacs' internal format (emacs-mule),
3582              or the first byte of UTF-16.  */
3583           try = (CODING_CATEGORY_MASK_SJIS
3584                   | CODING_CATEGORY_MASK_EMACS_MULE
3585                   | CODING_CATEGORY_MASK_UTF_16_BE
3586                   | CODING_CATEGORY_MASK_UTF_16_LE);
3587
3588           /* Or, if C is a special latin extra code,
3589              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3590              or is an ISO2022 control-sequence-introducer (CSI),
3591              we should also consider the possibility of ISO2022 codings.  */
3592           if ((VECTORP (Vlatin_extra_code_table)
3593                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3594               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3595               || (c == ISO_CODE_CSI
3596                   && (src < src_end
3597                       && (*src == ']'
3598                           || ((*src == '0' || *src == '1' || *src == '2')
3599                               && src + 1 < src_end
3600                               && src[1] == ']')))))
3601             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3602                      | CODING_CATEGORY_MASK_ISO_8BIT);
3603         }
3604       else
3605         /* C is a character of ISO2022 in graphic plane right,
3606            or a SJIS's 1-byte character code (i.e. JISX0201),
3607            or the first byte of BIG5's 2-byte code,
3608            or the first byte of UTF-8/16.  */
3609         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3610                 | CODING_CATEGORY_MASK_ISO_8BIT
3611                 | CODING_CATEGORY_MASK_SJIS
3612                 | CODING_CATEGORY_MASK_BIG5
3613                 | CODING_CATEGORY_MASK_UTF_8
3614                 | CODING_CATEGORY_MASK_UTF_16_BE
3615                 | CODING_CATEGORY_MASK_UTF_16_LE);
3616
3617       /* Or, we may have to consider the possibility of CCL.  */
3618       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3619           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3620               ->spec.ccl.valid_codes)[c])
3621         try |= CODING_CATEGORY_MASK_CCL;
3622
3623       mask = 0;
3624       utf16_examined_p = iso2022_examined_p = 0;
3625       if (priorities)
3626         {
3627           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3628             {
3629               if (!iso2022_examined_p
3630                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3631                 {
3632                   mask |= detect_coding_iso2022 (src, src_end);
3633                   iso2022_examined_p = 1;
3634                 }
3635               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3636                 mask |= detect_coding_sjis (src, src_end, multibytep);
3637               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3638                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
3639               else if (!utf16_examined_p
3640                        && (priorities[i] & try &
3641                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3642                 {
3643                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
3644                   utf16_examined_p = 1;
3645                 }
3646               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3647                 mask |= detect_coding_big5 (src, src_end, multibytep);
3648               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3649                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
3650               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3651                 mask |= detect_coding_ccl (src, src_end, multibytep);
3652               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3653                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3654               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3655                 mask |= CODING_CATEGORY_MASK_BINARY;
3656               if (mask & priorities[i])
3657                 return priorities[i];
3658             }
3659           return CODING_CATEGORY_MASK_RAW_TEXT;
3660         }
3661       if (try & CODING_CATEGORY_MASK_ISO)
3662         mask |= detect_coding_iso2022 (src, src_end, multibytep);
3663       if (try & CODING_CATEGORY_MASK_SJIS)
3664         mask |= detect_coding_sjis (src, src_end, multibytep);
3665       if (try & CODING_CATEGORY_MASK_BIG5)
3666         mask |= detect_coding_big5 (src, src_end, multibytep);
3667       if (try & CODING_CATEGORY_MASK_UTF_8)
3668         mask |= detect_coding_utf_8 (src, src_end, multibytep);
3669       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3670         mask |= detect_coding_utf_16 (src, src_end, multibytep);
3671       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3672         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
3673       if (try & CODING_CATEGORY_MASK_CCL)
3674         mask |= detect_coding_ccl (src, src_end, multibytep);
3675     }
3676   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3677 }
3678
3679 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3680    The information of the detected coding system is set in CODING.  */
3681
3682 void
3683 detect_coding (coding, src, src_bytes)
3684      struct coding_system *coding;
3685      unsigned char *src;
3686      int src_bytes;
3687 {
3688   unsigned int idx;
3689   int skip, mask, i;
3690   Lisp_Object val;
3691
3692   val = Vcoding_category_list;
3693   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip, 0);
3694   coding->heading_ascii = skip;
3695
3696   if (!mask) return;
3697
3698   /* We found a single coding system of the highest priority in MASK.  */
3699   idx = 0;
3700   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3701   if (! mask)
3702     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3703
3704   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3705
3706   if (coding->eol_type != CODING_EOL_UNDECIDED)
3707     {
3708       Lisp_Object tmp;
3709
3710       tmp = Fget (val, Qeol_type);
3711       if (VECTORP (tmp))
3712         val = XVECTOR (tmp)->contents[coding->eol_type];
3713     }
3714
3715   /* Setup this new coding system while preserving some slots.  */
3716   {
3717     int src_multibyte = coding->src_multibyte;
3718     int dst_multibyte = coding->dst_multibyte;
3719
3720     setup_coding_system (val, coding);
3721     coding->src_multibyte = src_multibyte;
3722     coding->dst_multibyte = dst_multibyte;
3723     coding->heading_ascii = skip;
3724   }
3725 }
3726
3727 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3728    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3729    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3730
3731    How many non-eol characters are at the head is returned as *SKIP.  */
3732
3733 #define MAX_EOL_CHECK_COUNT 3
3734
3735 static int
3736 detect_eol_type (source, src_bytes, skip)
3737      unsigned char *source;
3738      int src_bytes, *skip;
3739 {
3740   unsigned char *src = source, *src_end = src + src_bytes;
3741   unsigned char c;
3742   int total = 0;                /* How many end-of-lines are found so far.  */
3743   int eol_type = CODING_EOL_UNDECIDED;
3744   int this_eol_type;
3745
3746   *skip = 0;
3747
3748   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3749     {
3750       c = *src++;
3751       if (c == '\n' || c == '\r')
3752         {
3753           if (*skip == 0)
3754             *skip = src - 1 - source;
3755           total++;
3756           if (c == '\n')
3757             this_eol_type = CODING_EOL_LF;
3758           else if (src >= src_end || *src != '\n')
3759             this_eol_type = CODING_EOL_CR;
3760           else
3761             this_eol_type = CODING_EOL_CRLF, src++;
3762
3763           if (eol_type == CODING_EOL_UNDECIDED)
3764             /* This is the first end-of-line.  */
3765             eol_type = this_eol_type;
3766           else if (eol_type != this_eol_type)
3767             {
3768               /* The found type is different from what found before.  */
3769               eol_type = CODING_EOL_INCONSISTENT;
3770               break;
3771             }
3772         }
3773     }
3774
3775   if (*skip == 0)
3776     *skip = src_end - source;
3777   return eol_type;
3778 }
3779
3780 /* Like detect_eol_type, but detect EOL type in 2-octet
3781    big-endian/little-endian format for coding systems utf-16-be and
3782    utf-16-le.  */
3783
3784 static int
3785 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3786      unsigned char *source;
3787      int src_bytes, *skip;
3788 {
3789   unsigned char *src = source, *src_end = src + src_bytes;
3790   unsigned int c1, c2;
3791   int total = 0;                /* How many end-of-lines are found so far.  */
3792   int eol_type = CODING_EOL_UNDECIDED;
3793   int this_eol_type;
3794   int msb, lsb;
3795
3796   if (big_endian_p)
3797     msb = 0, lsb = 1;
3798   else
3799     msb = 1, lsb = 0;
3800
3801   *skip = 0;
3802
3803   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3804     {
3805       c1 = (src[msb] << 8) | (src[lsb]);
3806       src += 2;
3807
3808       if (c1 == '\n' || c1 == '\r')
3809         {
3810           if (*skip == 0)
3811             *skip = src - 2 - source;
3812           total++;
3813           if (c1 == '\n')
3814             {
3815               this_eol_type = CODING_EOL_LF;
3816             }
3817           else
3818             {
3819               if ((src + 1) >= src_end)
3820                 {
3821                   this_eol_type = CODING_EOL_CR;
3822                 }
3823               else
3824                 {
3825                   c2 = (src[msb] << 8) | (src[lsb]);
3826                   if (c2 == '\n')
3827                     this_eol_type = CODING_EOL_CRLF, src += 2;
3828                   else
3829                     this_eol_type = CODING_EOL_CR;
3830                 }
3831             }
3832
3833           if (eol_type == CODING_EOL_UNDECIDED)
3834             /* This is the first end-of-line.  */
3835             eol_type = this_eol_type;
3836           else if (eol_type != this_eol_type)
3837             {
3838               /* The found type is different from what found before.  */
3839               eol_type = CODING_EOL_INCONSISTENT;
3840               break;
3841             }
3842         }
3843     }
3844
3845   if (*skip == 0)
3846     *skip = src_end - source;
3847   return eol_type;
3848 }
3849
3850 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3851    is encoded.  If it detects an appropriate format of end-of-line, it
3852    sets the information in *CODING.  */
3853
3854 void
3855 detect_eol (coding, src, src_bytes)
3856      struct coding_system *coding;
3857      unsigned char *src;
3858      int src_bytes;
3859 {
3860   Lisp_Object val;
3861   int skip;
3862   int eol_type;
3863
3864   switch (coding->category_idx)
3865     {
3866     case CODING_CATEGORY_IDX_UTF_16_BE:
3867       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3868       break;
3869     case CODING_CATEGORY_IDX_UTF_16_LE:
3870       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3871       break;
3872     default:
3873       eol_type = detect_eol_type (src, src_bytes, &skip);
3874       break;
3875     }
3876
3877   if (coding->heading_ascii > skip)
3878     coding->heading_ascii = skip;
3879   else
3880     skip = coding->heading_ascii;
3881
3882   if (eol_type == CODING_EOL_UNDECIDED)
3883     return;
3884   if (eol_type == CODING_EOL_INCONSISTENT)
3885     {
3886 #if 0
3887       /* This code is suppressed until we find a better way to
3888          distinguish raw text file and binary file.  */
3889
3890       /* If we have already detected that the coding is raw-text, the
3891          coding should actually be no-conversion.  */
3892       if (coding->type == coding_type_raw_text)
3893         {
3894           setup_coding_system (Qno_conversion, coding);
3895           return;
3896         }
3897       /* Else, let's decode only text code anyway.  */
3898 #endif /* 0 */
3899       eol_type = CODING_EOL_LF;
3900     }
3901
3902   val = Fget (coding->symbol, Qeol_type);
3903   if (VECTORP (val) && XVECTOR (val)->size == 3)
3904     {
3905       int src_multibyte = coding->src_multibyte;
3906       int dst_multibyte = coding->dst_multibyte;
3907
3908       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3909       coding->src_multibyte = src_multibyte;
3910       coding->dst_multibyte = dst_multibyte;
3911       coding->heading_ascii = skip;
3912     }
3913 }
3914
3915 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3916
3917 #define DECODING_BUFFER_MAG(coding)                     \
3918   (coding->type == coding_type_iso2022                  \
3919    ? 3                                                  \
3920    : (coding->type == coding_type_ccl                   \
3921       ? coding->spec.ccl.decoder.buf_magnification      \
3922       : 2))
3923
3924 /* Return maximum size (bytes) of a buffer enough for decoding
3925    SRC_BYTES of text encoded in CODING.  */
3926
3927 int
3928 decoding_buffer_size (coding, src_bytes)
3929      struct coding_system *coding;
3930      int src_bytes;
3931 {
3932   return (src_bytes * DECODING_BUFFER_MAG (coding)
3933           + CONVERSION_BUFFER_EXTRA_ROOM);
3934 }
3935
3936 /* Return maximum size (bytes) of a buffer enough for encoding
3937    SRC_BYTES of text to CODING.  */
3938
3939 int
3940 encoding_buffer_size (coding, src_bytes)
3941      struct coding_system *coding;
3942      int src_bytes;
3943 {
3944   int magnification;
3945
3946   if (coding->type == coding_type_ccl)
3947     magnification = coding->spec.ccl.encoder.buf_magnification;
3948   else if (CODING_REQUIRE_ENCODING (coding))
3949     magnification = 3;
3950   else
3951     magnification = 1;
3952
3953   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3954 }
3955
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Size in bytes of the memory at DATA.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;
};

/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  */
#define MAX_ALLOCA (16 * 1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   BUF is the structure itself, not a pointer to it.  Requests smaller
   than MAX_ALLOCA are served by alloca, the others by xmalloc;
   BUF.on_stack records which.  LEN is evaluated more than once, so it
   should have no side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
3983
3984 /* Double the allocated memory for *BUF.  */
3985 static void
3986 extend_conversion_buffer (buf)
3987      struct conversion_buffer *buf;
3988 {
3989   if (buf->on_stack)
3990     {
3991       unsigned char *save = buf->data;
3992       buf->data = (unsigned char *) xmalloc (buf->size * 2);
3993       bcopy (save, buf->data, buf->size);
3994       buf->on_stack = 0;
3995     }
3996   else
3997     {
3998       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
3999     }
4000   buf->size *= 2;
4001 }
4002
4003 /* Free the allocated memory for BUF if it is not on stack.  */
4004 static void
4005 free_conversion_buffer (buf)
4006      struct conversion_buffer *buf;
4007 {
4008   if (!buf->on_stack)
4009     xfree (buf->data);
4010 }
4011
4012 int
4013 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4014      struct coding_system *coding;
4015      unsigned char *source, *destination;
4016      int src_bytes, dst_bytes, encodep;
4017 {
4018   struct ccl_program *ccl
4019     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4020   int result;
4021
4022   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4023   if (encodep)
4024     ccl->eol_type = coding->eol_type;
4025   ccl->multibyte = coding->src_multibyte;
4026   coding->produced = ccl_driver (ccl, source, destination,
4027                                  src_bytes, dst_bytes, &(coding->consumed));
4028   if (encodep)
4029     coding->produced_char = coding->produced;
4030   else
4031     {
4032       int bytes
4033         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4034       coding->produced = str_as_multibyte (destination, bytes,
4035                                            coding->produced,
4036                                            &(coding->produced_char));
4037     }
4038
4039   switch (ccl->status)
4040     {
4041     case CCL_STAT_SUSPEND_BY_SRC:
4042       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4043       break;
4044     case CCL_STAT_SUSPEND_BY_DST:
4045       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4046       break;
4047     case CCL_STAT_QUIT:
4048     case CCL_STAT_INVALID_CMD:
4049       coding->result = CODING_FINISH_INTERRUPT;
4050       break;
4051     default:
4052       coding->result = CODING_FINISH_NORMAL;
4053       break;
4054     }
4055   return coding->result;
4056 }
4057
4058 /* Decode EOL format of the text at PTR of BYTES length destructively
4059    according to CODING->eol_type.  This is called after the CCL
4060    program produced a decoded text at PTR.  If we do CRLF->LF
4061    conversion, update CODING->produced and CODING->produced_char.  */
4062
4063 static void
4064 decode_eol_post_ccl (coding, ptr, bytes)
4065      struct coding_system *coding;
4066      unsigned char *ptr;
4067      int bytes;
4068 {
4069   Lisp_Object val, saved_coding_symbol;
4070   unsigned char *pend = ptr + bytes;
4071   int dummy;
4072
4073   /* Remember the current coding system symbol.  We set it back when
4074      an inconsistent EOL is found so that `last-coding-system-used' is
4075      set to the coding system that doesn't specify EOL conversion.  */
4076   saved_coding_symbol = coding->symbol;
4077
4078   coding->spec.ccl.cr_carryover = 0;
4079   if (coding->eol_type == CODING_EOL_UNDECIDED)
4080     {
4081       /* Here, to avoid the call of setup_coding_system, we directly
4082          call detect_eol_type.  */
4083       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4084       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4085         coding->eol_type = CODING_EOL_LF;
4086       if (coding->eol_type != CODING_EOL_UNDECIDED)
4087         {
4088           val = Fget (coding->symbol, Qeol_type);
4089           if (VECTORP (val) && XVECTOR (val)->size == 3)
4090             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4091         }
4092       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4093     }
4094
4095   if (coding->eol_type == CODING_EOL_LF
4096       || coding->eol_type == CODING_EOL_UNDECIDED)
4097     {
4098       /* We have nothing to do.  */
4099       ptr = pend;
4100     }
4101   else if (coding->eol_type == CODING_EOL_CRLF)
4102     {
4103       unsigned char *pstart = ptr, *p = ptr;
4104
4105       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4106           && *(pend - 1) == '\r')
4107         {
4108           /* If the last character is CR, we can't handle it here
4109              because LF will be in the not-yet-decoded source text.
4110              Recorded that the CR is not yet processed.  */
4111           coding->spec.ccl.cr_carryover = 1;
4112           coding->produced--;
4113           coding->produced_char--;
4114           pend--;
4115         }
4116       while (ptr < pend)
4117         {
4118           if (*ptr == '\r')
4119             {
4120               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4121                 {
4122                   *p++ = '\n';
4123                   ptr += 2;
4124                 }
4125               else
4126                 {
4127                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4128                     goto undo_eol_conversion;
4129                   *p++ = *ptr++;
4130                 }
4131             }
4132           else if (*ptr == '\n'
4133                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4134             goto undo_eol_conversion;
4135           else
4136             *p++ = *ptr++;
4137           continue;
4138
4139         undo_eol_conversion:
4140           /* We have faced with inconsistent EOL format at PTR.
4141              Convert all LFs before PTR back to CRLFs.  */
4142           for (p--, ptr--; p >= pstart; p--)
4143             {
4144               if (*p == '\n')
4145                 *ptr-- = '\n', *ptr-- = '\r';
4146               else
4147                 *ptr-- = *p;
4148             }
4149           /*  If carryover is recorded, cancel it because we don't
4150               convert CRLF anymore.  */
4151           if (coding->spec.ccl.cr_carryover)
4152             {
4153               coding->spec.ccl.cr_carryover = 0;
4154               coding->produced++;
4155               coding->produced_char++;
4156               pend++;
4157             }
4158           p = ptr = pend;
4159           coding->eol_type = CODING_EOL_LF;
4160           coding->symbol = saved_coding_symbol;
4161         }
4162       if (p < pend)
4163         {
4164           /* As each two-byte sequence CRLF was converted to LF, (PEND
4165              - P) is the number of deleted characters.  */
4166           coding->produced -= pend - p;
4167           coding->produced_char -= pend - p;
4168         }
4169     }
4170   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4171     {
4172       unsigned char *p = ptr;
4173
4174       for (; ptr < pend; ptr++)
4175         {
4176           if (*ptr == '\r')
4177             *ptr = '\n';
4178           else if (*ptr == '\n'
4179                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4180             {
4181               for (; p < ptr; p++)
4182                 {
4183                   if (*p == '\n')
4184                     *p = '\r';
4185                 }
4186               ptr = pend;
4187               coding->eol_type = CODING_EOL_LF;
4188               coding->symbol = saved_coding_symbol;
4189             }
4190         }
4191     }
4192 }
4193
4194 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4195    decoding, it may detect coding system and format of end-of-line if
4196    those are not yet decided.  The source should be unibyte, the
4197    result is multibyte if CODING->dst_multibyte is nonzero, else
4198    unibyte.  */
4199
4200 int
4201 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4202      struct coding_system *coding;
4203      unsigned char *source, *destination;
4204      int src_bytes, dst_bytes;
4205 {
4206   if (coding->type == coding_type_undecided)
4207     detect_coding (coding, source, src_bytes);
4208
4209   if (coding->eol_type == CODING_EOL_UNDECIDED
4210       && coding->type != coding_type_ccl)
4211     detect_eol (coding, source, src_bytes);
4212
4213   coding->produced = coding->produced_char = 0;
4214   coding->consumed = coding->consumed_char = 0;
4215   coding->errors = 0;
4216   coding->result = CODING_FINISH_NORMAL;
4217
4218   switch (coding->type)
4219     {
4220     case coding_type_sjis:
4221       decode_coding_sjis_big5 (coding, source, destination,
4222                                src_bytes, dst_bytes, 1);
4223       break;
4224
4225     case coding_type_iso2022:
4226       decode_coding_iso2022 (coding, source, destination,
4227                              src_bytes, dst_bytes);
4228       break;
4229
4230     case coding_type_big5:
4231       decode_coding_sjis_big5 (coding, source, destination,
4232                                src_bytes, dst_bytes, 0);
4233       break;
4234
4235     case coding_type_emacs_mule:
4236       decode_coding_emacs_mule (coding, source, destination,
4237                                 src_bytes, dst_bytes);
4238       break;
4239
4240     case coding_type_ccl:
4241       if (coding->spec.ccl.cr_carryover)
4242         {
4243           /* Set the CR which is not processed by the previous call of
4244              decode_eol_post_ccl in DESTINATION.  */
4245           *destination = '\r';
4246           coding->produced++;
4247           coding->produced_char++;
4248           dst_bytes--;
4249         }
4250       ccl_coding_driver (coding, source,
4251                          destination + coding->spec.ccl.cr_carryover,
4252                          src_bytes, dst_bytes, 0);
4253       if (coding->eol_type != CODING_EOL_LF)
4254         decode_eol_post_ccl (coding, destination, coding->produced);
4255       break;
4256
4257     default:
4258       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4259     }
4260
4261   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4262       && coding->mode & CODING_MODE_LAST_BLOCK
4263       && coding->consumed == src_bytes)
4264     coding->result = CODING_FINISH_NORMAL;
4265
4266   if (coding->mode & CODING_MODE_LAST_BLOCK
4267       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4268     {
4269       unsigned char *src = source + coding->consumed;
4270       unsigned char *dst = destination + coding->produced;
4271
4272       src_bytes -= coding->consumed;
4273       coding->errors++;
4274       if (COMPOSING_P (coding))
4275         DECODE_COMPOSITION_END ('1');
4276       while (src_bytes--)
4277         {
4278           int c = *src++;
4279           dst += CHAR_STRING (c, dst);
4280           coding->produced_char++;
4281         }
4282       coding->consumed = coding->consumed_char = src - source;
4283       coding->produced = dst - destination;
4284       coding->result = CODING_FINISH_NORMAL;
4285     }
4286
4287   if (!coding->dst_multibyte)
4288     {
4289       coding->produced = str_as_unibyte (destination, coding->produced);
4290       coding->produced_char = coding->produced;
4291     }
4292
4293   return coding->result;
4294 }
4295
4296 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4297    multibyteness of the source is CODING->src_multibyte, the
4298    multibyteness of the result is always unibyte.  */
4299
4300 int
4301 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4302      struct coding_system *coding;
4303      unsigned char *source, *destination;
4304      int src_bytes, dst_bytes;
4305 {
4306   coding->produced = coding->produced_char = 0;
4307   coding->consumed = coding->consumed_char = 0;
4308   coding->errors = 0;
4309   coding->result = CODING_FINISH_NORMAL;
4310
4311   switch (coding->type)
4312     {
4313     case coding_type_sjis:
4314       encode_coding_sjis_big5 (coding, source, destination,
4315                                src_bytes, dst_bytes, 1);
4316       break;
4317
4318     case coding_type_iso2022:
4319       encode_coding_iso2022 (coding, source, destination,
4320                              src_bytes, dst_bytes);
4321       break;
4322
4323     case coding_type_big5:
4324       encode_coding_sjis_big5 (coding, source, destination,
4325                                src_bytes, dst_bytes, 0);
4326       break;
4327
4328     case coding_type_emacs_mule:
4329       encode_coding_emacs_mule (coding, source, destination,
4330                                 src_bytes, dst_bytes);
4331       break;
4332
4333     case coding_type_ccl:
4334       ccl_coding_driver (coding, source, destination,
4335                          src_bytes, dst_bytes, 1);
4336       break;
4337
4338     default:
4339       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4340     }
4341
4342   if (coding->mode & CODING_MODE_LAST_BLOCK
4343       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4344     {
4345       unsigned char *src = source + coding->consumed;
4346       unsigned char *src_end = src + src_bytes;
4347       unsigned char *dst = destination + coding->produced;
4348
4349       if (coding->type == coding_type_iso2022)
4350         ENCODE_RESET_PLANE_AND_REGISTER;
4351       if (COMPOSING_P (coding))
4352         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4353       if (coding->consumed < src_bytes)
4354         {
4355           int len = src_bytes - coding->consumed;
4356
4357           BCOPY_SHORT (source + coding->consumed, dst, len);
4358           if (coding->src_multibyte)
4359             len = str_as_unibyte (dst, len);
4360           dst += len;
4361           coding->consumed = src_bytes;
4362         }
4363       coding->produced = coding->produced_char = dst - destination;
4364       coding->result = CODING_FINISH_NORMAL;
4365     }
4366
4367   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4368       && coding->consumed == src_bytes)
4369     coding->result = CODING_FINISH_NORMAL;
4370
4371   return coding->result;
4372 }
4373
4374 /* Scan text in the region between *BEG and *END (byte positions),
4375    skip characters which we don't have to decode by coding system
4376    CODING at the head and tail, then set *BEG and *END to the region
4377    of the text we actually have to convert.  The caller should move
4378    the gap out of the region in advance if the region is from a
4379    buffer.
4380
4381    If STR is not NULL, *BEG and *END are indices into STR.  */
4382
4383 static void
4384 shrink_decoding_region (beg, end, coding, str)
4385      int *beg, *end;
4386      struct coding_system *coding;
4387      unsigned char *str;
4388 {
4389   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4390   int eol_conversion;
4391   Lisp_Object translation_table;
4392
4393   if (coding->type == coding_type_ccl
4394       || coding->type == coding_type_undecided
4395       || coding->eol_type != CODING_EOL_LF
4396       || !NILP (coding->post_read_conversion)
4397       || coding->composing != COMPOSITION_DISABLED)
4398     {
4399       /* We can't skip any data.  */
4400       return;
4401     }
4402   if (coding->type == coding_type_no_conversion
4403       || coding->type == coding_type_raw_text
4404       || coding->type == coding_type_emacs_mule)
4405     {
4406       /* We need no conversion, but don't have to skip any data here.
4407          Decoding routine handles them effectively anyway.  */
4408       return;
4409     }
4410
4411   translation_table = coding->translation_table_for_decode;
4412   if (NILP (translation_table) && !NILP (Venable_character_translation))
4413     translation_table = Vstandard_translation_table_for_decode;
4414   if (CHAR_TABLE_P (translation_table))
4415     {
4416       int i;
4417       for (i = 0; i < 128; i++)
4418         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4419           break;
4420       if (i < 128)
4421         /* Some ASCII character should be translated.  We give up
4422            shrinking.  */
4423         return;
4424     }
4425
4426   if (coding->heading_ascii >= 0)
4427     /* Detection routine has already found how much we can skip at the
4428        head.  */
4429     *beg += coding->heading_ascii;
4430
4431   if (str)
4432     {
4433       begp_orig = begp = str + *beg;
4434       endp_orig = endp = str + *end;
4435     }
4436   else
4437     {
4438       begp_orig = begp = BYTE_POS_ADDR (*beg);
4439       endp_orig = endp = begp + *end - *beg;
4440     }
4441
4442   eol_conversion = (coding->eol_type == CODING_EOL_CR
4443                     || coding->eol_type == CODING_EOL_CRLF);
4444
4445   switch (coding->type)
4446     {
4447     case coding_type_sjis:
4448     case coding_type_big5:
4449       /* We can skip all ASCII characters at the head.  */
4450       if (coding->heading_ascii < 0)
4451         {
4452           if (eol_conversion)
4453             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4454           else
4455             while (begp < endp && *begp < 0x80) begp++;
4456         }
4457       /* We can skip all ASCII characters at the tail except for the
4458          second byte of SJIS or BIG5 code.  */
4459       if (eol_conversion)
4460         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4461       else
4462         while (begp < endp && endp[-1] < 0x80) endp--;
4463       /* Do not consider LF as ascii if preceded by CR, since that
4464          confuses eol decoding. */
4465       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4466         endp++;
4467       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4468         endp++;
4469       break;
4470
4471     case coding_type_iso2022:
4472       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4473         /* We can't skip any data.  */
4474         break;
4475       if (coding->heading_ascii < 0)
4476         {
4477           /* We can skip all ASCII characters at the head except for a
4478              few control codes.  */
4479           while (begp < endp && (c = *begp) < 0x80
4480                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4481                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4482                  && (!eol_conversion || c != ISO_CODE_LF))
4483             begp++;
4484         }
4485       switch (coding->category_idx)
4486         {
4487         case CODING_CATEGORY_IDX_ISO_8_1:
4488         case CODING_CATEGORY_IDX_ISO_8_2:
4489           /* We can skip all ASCII characters at the tail.  */
4490           if (eol_conversion)
4491             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4492           else
4493             while (begp < endp && endp[-1] < 0x80) endp--;
4494           /* Do not consider LF as ascii if preceded by CR, since that
4495              confuses eol decoding. */
4496           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4497             endp++;
4498           break;
4499
4500         case CODING_CATEGORY_IDX_ISO_7:
4501         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4502           {
4503             /* We can skip all charactes at the tail except for 8-bit
4504                codes and ESC and the following 2-byte at the tail.  */
4505             unsigned char *eight_bit = NULL;
4506
4507             if (eol_conversion)
4508               while (begp < endp
4509                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4510                 {
4511                   if (!eight_bit && c & 0x80) eight_bit = endp;
4512                   endp--;
4513                 }
4514             else
4515               while (begp < endp
4516                      && (c = endp[-1]) != ISO_CODE_ESC)
4517                 {
4518                   if (!eight_bit && c & 0x80) eight_bit = endp;
4519                   endp--;
4520                 }
4521             /* Do not consider LF as ascii if preceded by CR, since that
4522                confuses eol decoding. */
4523             if (begp < endp && endp < endp_orig
4524                 && endp[-1] == '\r' && endp[0] == '\n')
4525               endp++;
4526             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4527               {
4528                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4529                   /* This is an ASCII designation sequence.  We can
4530                      surely skip the tail.  But, if we have
4531                      encountered an 8-bit code, skip only the codes
4532                      after that.  */
4533                   endp = eight_bit ? eight_bit : endp + 2;
4534                 else
4535                   /* Hmmm, we can't skip the tail.  */
4536                   endp = endp_orig;
4537               }
4538             else if (eight_bit)
4539               endp = eight_bit;
4540           }
4541         }
4542       break;
4543
4544     default:
4545       abort ();
4546     }
4547   *beg += begp - begp_orig;
4548   *end += endp - endp_orig;
4549   return;
4550 }
4551
4552 /* Like shrink_decoding_region but for encoding.  */
4553
4554 static void
4555 shrink_encoding_region (beg, end, coding, str)
4556      int *beg, *end;
4557      struct coding_system *coding;
4558      unsigned char *str;
4559 {
4560   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4561   int eol_conversion;
4562   Lisp_Object translation_table;
4563
4564   if (coding->type == coding_type_ccl
4565       || coding->eol_type == CODING_EOL_CRLF
4566       || coding->eol_type == CODING_EOL_CR
4567       || coding->cmp_data && coding->cmp_data->used > 0)
4568     {
4569       /* We can't skip any data.  */
4570       return;
4571     }
4572   if (coding->type == coding_type_no_conversion
4573       || coding->type == coding_type_raw_text
4574       || coding->type == coding_type_emacs_mule
4575       || coding->type == coding_type_undecided)
4576     {
4577       /* We need no conversion, but don't have to skip any data here.
4578          Encoding routine handles them effectively anyway.  */
4579       return;
4580     }
4581
4582   translation_table = coding->translation_table_for_encode;
4583   if (NILP (translation_table) && !NILP (Venable_character_translation))
4584     translation_table = Vstandard_translation_table_for_encode;
4585   if (CHAR_TABLE_P (translation_table))
4586     {
4587       int i;
4588       for (i = 0; i < 128; i++)
4589         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4590           break;
4591       if (i < 128)
4592         /* Some ASCII character should be tranlsated.  We give up
4593            shrinking.  */
4594         return;
4595     }
4596
4597   if (str)
4598     {
4599       begp_orig = begp = str + *beg;
4600       endp_orig = endp = str + *end;
4601     }
4602   else
4603     {
4604       begp_orig = begp = BYTE_POS_ADDR (*beg);
4605       endp_orig = endp = begp + *end - *beg;
4606     }
4607
4608   eol_conversion = (coding->eol_type == CODING_EOL_CR
4609                     || coding->eol_type == CODING_EOL_CRLF);
4610
4611   /* Here, we don't have to check coding->pre_write_conversion because
4612      the caller is expected to have handled it already.  */
4613   switch (coding->type)
4614     {
4615     case coding_type_iso2022:
4616       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4617         /* We can't skip any data.  */
4618         break;
4619       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4620         {
4621           unsigned char *bol = begp;
4622           while (begp < endp && *begp < 0x80)
4623             {
4624               begp++;
4625               if (begp[-1] == '\n')
4626                 bol = begp;
4627             }
4628           begp = bol;
4629           goto label_skip_tail;
4630         }
4631       /* fall down ... */
4632
4633     case coding_type_sjis:
4634     case coding_type_big5:
4635       /* We can skip all ASCII characters at the head and tail.  */
4636       if (eol_conversion)
4637         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4638       else
4639         while (begp < endp && *begp < 0x80) begp++;
4640     label_skip_tail:
4641       if (eol_conversion)
4642         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4643       else
4644         while (begp < endp && *(endp - 1) < 0x80) endp--;
4645       break;
4646
4647     default:
4648       abort ();
4649     }
4650
4651   *beg += begp - begp_orig;
4652   *end += endp - endp_orig;
4653   return;
4654 }
4655
4656 /* As shrinking conversion region requires some overhead, we don't try
4657    shrinking unless the length of conversion region is greater than
4658    this value.  */
4659 static int shrink_conversion_region_threshhold = 1024;
4660
/* Shrink the conversion region delimited by *BEG and *END when it is
   longer than shrink_conversion_region_threshhold.  If ENCODEP is
   nonzero, shrink_encoding_region is called; otherwise
   shrink_decoding_region is called.  BEG, END, CODING and STR are
   handed to the chosen function as they are.  */
4661 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
4662   do {                                                                  \
4663     if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
4664       {                                                                 \
4665         if (encodep) shrink_encoding_region (beg, end, coding, str);    \
4666         else shrink_decoding_region (beg, end, coding, str);            \
4667       }                                                                 \
4668   } while (0)
4669
/* Handler registered with record_unwind_protect around calls to
   pre-write-conversion and post-read-conversion functions.  It clears
   inhibit_pre_post_conversion.  DUMMY is ignored.  Always returns
   Qnil.  */
4670 static Lisp_Object
4671 code_convert_region_unwind (dummy)
4672      Lisp_Object dummy;
4673 {
4674   inhibit_pre_post_conversion = 0;
4675   return Qnil;
4676 }
4677
4678 /* Store information about all compositions in the range FROM and TO
4679    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4680    buffer or a string, defaults to the current buffer.  */
4681
4682 void
4683 coding_save_composition (coding, from, to, obj)
4684      struct coding_system *coding;
4685      int from, to;
4686      Lisp_Object obj;
4687 {
4688   Lisp_Object prop;
4689   int start, end;
4690
4691   if (coding->composing == COMPOSITION_DISABLED)
4692     return;
4693   if (!coding->cmp_data)
4694     coding_allocate_composition_data (coding, from);
4695   if (!find_composition (from, to, &start, &end, &prop, obj)
4696       || end > to)
4697     return;
4698   if (start < from
4699       && (!find_composition (end, to, &start, &end, &prop, obj)
4700           || end > to))
4701     return;
4702   coding->composing = COMPOSITION_NO;
4703   do
4704     {
4705       if (COMPOSITION_VALID_P (start, end, prop))
4706         {
4707           enum composition_method method = COMPOSITION_METHOD (prop);
4708           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4709               >= COMPOSITION_DATA_SIZE)
4710             coding_allocate_composition_data (coding, from);
4711           /* For relative composition, we remember start and end
4712              positions, for the other compositions, we also remember
4713              components.  */
4714           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4715           if (method != COMPOSITION_RELATIVE)
4716             {
4717               /* We must store a*/
4718               Lisp_Object val, ch;
4719
4720               val = COMPOSITION_COMPONENTS (prop);
4721               if (CONSP (val))
4722                 while (CONSP (val))
4723                   {
4724                     ch = XCAR (val), val = XCDR (val);
4725                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4726                   }
4727               else if (VECTORP (val) || STRINGP (val))
4728                 {
4729                   int len = (VECTORP (val)
4730                              ? XVECTOR (val)->size : XSTRING (val)->size);
4731                   int i;
4732                   for (i = 0; i < len; i++)
4733                     {
4734                       ch = (STRINGP (val)
4735                             ? Faref (val, make_number (i))
4736                             : XVECTOR (val)->contents[i]);
4737                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4738                     }
4739                 }
4740               else              /* INTEGERP (val) */
4741                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4742             }
4743           CODING_ADD_COMPOSITION_END (coding, end - from);
4744         }
4745       start = end;
4746     }
4747   while (start < to
4748          && find_composition (start, to, &start, &end, &prop, obj)
4749          && end <= to);
4750
4751   /* Make coding->cmp_data point to the first memory block.  */
4752   while (coding->cmp_data->prev)
4753     coding->cmp_data = coding->cmp_data->prev;
4754   coding->cmp_data_start = 0;
4755 }
4756
4757 /* Reflect the saved information about compositions to OBJ.
4758    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4759    is a buffer or a string, defaults to the current buffer.  */
4760
4761 void
4762 coding_restore_composition (coding, obj)
4763      struct coding_system *coding;
4764      Lisp_Object obj;
4765 {
4766   struct composition_data *cmp_data = coding->cmp_data;
4767
4768   if (!cmp_data)
4769     return;
4770
4771   while (cmp_data->prev)
4772     cmp_data = cmp_data->prev;
4773
4774   while (cmp_data)
4775     {
4776       int i;
4777
4778       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
4779            i += cmp_data->data[i])
4780         {
4781           int *data = cmp_data->data + i;
4782           enum composition_method method = (enum composition_method) data[3];
4783           Lisp_Object components;
4784
4785           if (method == COMPOSITION_RELATIVE)
4786             components = Qnil;
4787           else
4788             {
4789               int len = data[0] - 4, j;
4790               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4791
4792               for (j = 0; j < len; j++)
4793                 args[j] = make_number (data[4 + j]);
4794               components = (method == COMPOSITION_WITH_ALTCHARS
4795                             ? Fstring (len, args) : Fvector (len, args));
4796             }
4797           compose_text (data[1], data[2], components, Qnil, obj);
4798         }
4799       cmp_data = cmp_data->next;
4800     }
4801 }
4802
4803 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4804    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4805    coding system CODING, and return the status code of code conversion
4806    (currently, this value is always 0 and has no meaning).
4807
4808    How many characters (and bytes) are converted to how many
4809    characters (and bytes) are recorded in the members consumed,
4810    consumed_char, produced and produced_char of the structure CODING.
        (On the early exit taken when shrinking leaves nothing to
        convert, only produced and produced_char are set.)
4811
4812    If REPLACE is nonzero, we do various things as if the original text
4813    is deleted and a new text is inserted.  See the comments in
4814    replace_range (insdel.c) to know what we are doing.
4815
4816    If REPLACE is zero, it is assumed that the source text is unibyte.
4817    Otherwise, it is multibyte iff the current buffer is multibyte.  */
4818
4819 int
4820 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4821      int from, from_byte, to, to_byte, encodep, replace;
4822      struct coding_system *coding;
4823 {
4824   int len = to - from, len_byte = to_byte - from_byte;
4825   int require, inserted, inserted_byte;
       /* HEAD_SKIP and TAIL_SKIP are assigned only when shrinking is
          attempted (CODING is not a CCL coding system); they are read
          only when TOTAL_SKIP is positive.  */
4826   int head_skip, tail_skip, total_skip = 0;
4827   Lisp_Object saved_coding_symbol;
4828   int first = 1;
4829   unsigned char *src, *dst;
4830   Lisp_Object deletion;
4831   int orig_point = PT, orig_len = len;
4832   int prev_Z;
4833   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4834
       /* The source is multibyte only when we replace text of a
          multibyte buffer; the destination follows the buffer.  */
4835   coding->src_multibyte = replace && multibyte_p;
4836   coding->dst_multibyte = multibyte_p;
4837
4838   deletion = Qnil;
4839   saved_coding_symbol = Qnil;
4840
       /* If point is strictly inside the region, move it to FROM.  */
4841   if (from < PT && PT < to)
4842     {
4843       TEMP_SET_PT_BOTH (from, from_byte);
4844       orig_point = from;
4845     }
4846
4847   if (replace)
4848     {
4849       int saved_from = from;
4850       int saved_inhibit_modification_hooks;
4851
4852       prepare_to_modify_buffer (from, to, &from);
           /* prepare_to_modify_buffer may have changed FROM; recompute
              the positions that depend on it.  */
4853       if (saved_from != from)
4854         {
4855           to = from + len;
4856           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4857           len_byte = to_byte - from_byte;
4858         }
4859
4860       /* The code conversion routine can not preserve text properties
4861          for now.  So, we must remove all text properties in the
4862          region.  Here, we must suppress all modification hooks.  */
4863       saved_inhibit_modification_hooks = inhibit_modification_hooks;
4864       inhibit_modification_hooks = 1;
4865       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4866       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4867     }
4868
4869   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4870     {
4871       /* We must detect encoding of text and eol format.  */
4872
4873       if (from < GPT && to > GPT)
4874         move_gap_both (from, from_byte);
4875       if (coding->type == coding_type_undecided)
4876         {
4877           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4878           if (coding->type == coding_type_undecided)
4879             {
4880               /* It seems that the text contains only ASCII, but we
4881                  should not leave it undecided because the deeper
4882                  decoding routine (decode_coding) tries to detect the
4883                  encodings again in vain.  */
4884               coding->type = coding_type_emacs_mule;
4885               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
4886             }
4887         }
4888       if (coding->eol_type == CODING_EOL_UNDECIDED
4889           && coding->type != coding_type_ccl)
4890         {
4891           saved_coding_symbol = coding->symbol;
4892           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4893           if (coding->eol_type == CODING_EOL_UNDECIDED)
4894             coding->eol_type = CODING_EOL_LF;
4895           /* We had better recover the original eol format if we
4896              encounter an inconsistent eol format while decoding.  */
4897           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4898         }
4899     }
4900
4901   /* Now we convert the text.  */
4902
4903   /* For encoding, we must process pre-write-conversion in advance.  */
4904   if (! inhibit_pre_post_conversion
4905       && encodep
4906       && SYMBOLP (coding->pre_write_conversion)
4907       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4908     {
4909       /* The function in pre-write-conversion may put a new text in a
4910          new buffer.  */
4911       struct buffer *prev = current_buffer;
4912       Lisp_Object new;
           /* NOTE(review): COUNT is not referenced again in this block;
              the unwind-protect entry is discarded below by decrementing
              specpdl_ptr directly.  */
4913       int count = specpdl_ptr - specpdl;
4914
4915       record_unwind_protect (code_convert_region_unwind, Qnil);
4916       /* We should not call any more pre-write/post-read-conversion
4917          functions while this pre-write-conversion is running.  */
4918       inhibit_pre_post_conversion = 1;
4919       call2 (coding->pre_write_conversion,
4920              make_number (from), make_number (to));
4921       inhibit_pre_post_conversion = 0;
4922       /* Discard the unwind protect.  */
4923       specpdl_ptr--;
4924
4925       if (current_buffer != prev)
4926         {
               /* The function left us in another buffer.  Replace the
                  region in the original buffer with the accessible text
                  of that buffer, kill it, and recompute the region.  */
4927           len = ZV - BEGV;
4928           new = Fcurrent_buffer ();
4929           set_buffer_internal_1 (prev);
4930           del_range_2 (from, from_byte, to, to_byte, 0);
4931           TEMP_SET_PT_BOTH (from, from_byte);
4932           insert_from_buffer (XBUFFER (new), 1, len, 0);
4933           Fkill_buffer (new);
4934           if (orig_point >= to)
4935             orig_point += len - orig_len;
4936           else if (orig_point > from)
4937             orig_point = from;
4938           orig_len = len;
4939           to = from + len;
4940           from_byte = CHAR_TO_BYTE (from);
4941           to_byte = CHAR_TO_BYTE (to);
4942           len_byte = to_byte - from_byte;
4943           TEMP_SET_PT_BOTH (from, from_byte);
4944         }
4945     }
4946
       /* Keep a copy of the text to be replaced; it is handed to
          adjust_after_replace once the conversion is done.  */
4947   if (replace)
4948     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4949
4950   if (coding->composing != COMPOSITION_DISABLED)
4951     {
4952       if (encodep)
4953         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4954       else
4955         coding_allocate_composition_data (coding, from);
4956     }
4957
4958   /* Try to skip the heading and tailing ASCIIs.  */
4959   if (coding->type != coding_type_ccl)
4960     {
4961       int from_byte_orig = from_byte, to_byte_orig = to_byte;
4962
4963       if (from < GPT && GPT < to)
4964         move_gap_both (from, from_byte);
4965       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* Nothing is left to convert, no post-read-conversion has to
              run, and nothing has to be flushed: we are done.  */
4966       if (from_byte == to_byte
4967           && (encodep || NILP (coding->post_read_conversion))
4968           && ! CODING_REQUIRE_FLUSHING (coding))
4969         {
4970           coding->produced = len_byte;
4971           coding->produced_char = len;
4972           if (!replace)
4973             /* We must record and adjust for this new text now.  */
4974             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4975           return 0;
4976         }
4977
           /* The same skip counts are applied to character and byte
              positions.  */
4978       head_skip = from_byte - from_byte_orig;
4979       tail_skip = to_byte_orig - to_byte;
4980       total_skip = head_skip + tail_skip;
4981       from += head_skip;
4982       to -= tail_skip;
4983       len -= total_skip; len_byte -= total_skip;
4984     }
4985
4986   /* For conversion, we must put the gap before the text in addition to
4987      making the gap larger for efficient decoding.  The required gap
4988      size starts from 2000 which is the magic number used in make_gap.
4989      But, after one batch of conversion, it will be incremented if we
4990      find that it is not enough.  */
4991   require = 2000;
4992
4993   if (GAP_SIZE  < require)
4994     make_gap (require - GAP_SIZE);
4995   move_gap_both (from, from_byte);
4996
4997   inserted = inserted_byte = 0;
4998
       /* Account the text to convert as part of the gap: from now on it
          occupies the last LEN_BYTE bytes of the gap (see the pictures
          in the loop below).  */
4999   GAP_SIZE += len_byte;
5000   ZV -= len;
5001   Z -= len;
5002   ZV_BYTE -= len_byte;
5003   Z_BYTE -= len_byte;
5004
5005   if (GPT - BEG < BEG_UNCHANGED)
5006     BEG_UNCHANGED = GPT - BEG;
5007   if (Z - GPT < END_UNCHANGED)
5008     END_UNCHANGED = Z - GPT;
5009
5010   if (!encodep && coding->src_multibyte)
5011     {
5012       /* Decoding routines expects that the source text is unibyte.
5013          We must convert 8-bit characters of multibyte form to
5014          unibyte.  */
5015       int len_byte_orig = len_byte;
5016       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, slide it so that it still ends at
              the end of the gap.  */
5017       if (len_byte < len_byte_orig)
5018         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5019                     len_byte);
5020       coding->src_multibyte = 0;
5021     }
5022
5023   for (;;)
5024     {
5025       int result;
5026
5027       /* The buffer memory is now:
5028          +--------+converted-text+---------+-------original-text-------+---+
5029          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5030                   |<---------------------- GAP ----------------------->|  */
5031       src = GAP_END_ADDR - len_byte;
5032       dst = GPT_ADDR + inserted_byte;
5033
5034       if (encodep)
5035         result = encode_coding (coding, src, dst, len_byte, 0);
5036       else
5037         result = decode_coding (coding, src, dst, len_byte, 0);
5038
5039       /* The buffer memory is now:
5040          +--------+-------converted-text----+--+------original-text----+---+
5041          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5042                   |<---------------------- GAP ----------------------->|  */
5043
5044       inserted += coding->produced_char;
5045       inserted_byte += coding->produced;
5046       len_byte -= coding->consumed;
5047
5048       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5049         {
5050           coding_allocate_composition_data (coding, from + inserted);
5051           continue;
5052         }
5053
5054       src += coding->consumed;
5055       dst += coding->produced;
5056
5057       if (result == CODING_FINISH_NORMAL)
5058         {
5059           src += len_byte;
5060           break;
5061         }
5062       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5063         {
5064           unsigned char *pend = dst, *p = pend - inserted_byte;
5065           Lisp_Object eol_type;
5066
5067           /* Encode LFs back to the original eol format (CR or CRLF).  */
5068           if (coding->eol_type == CODING_EOL_CR)
5069             {
5070               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5071             }
5072           else
5073             {
5074               int count = 0;
5075
5076               while (p < pend) if (*p++ == '\n') count++;
5077               if (src - dst < count)
5078                 {
5079                   /* We don't have sufficient room for encoding LFs
5080                      back to CRLF.  We must record converted and
5081                      not-yet-converted text back to the buffer
5082                      content, enlarge the gap, then record them out of
5083                      the buffer contents again.  */
5084                   int add = len_byte + inserted_byte;
5085
5086                   GAP_SIZE -= add;
5087                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5088                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5089                   make_gap (count - GAP_SIZE);
5090                   GAP_SIZE += add;
5091                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5092                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5093                   /* Don't forget to update SRC, DST, and PEND.  */
5094                   src = GAP_END_ADDR - len_byte;
5095                   dst = GPT_ADDR + inserted_byte;
5096                   pend = dst;
5097                 }
5098               inserted += count;
5099               inserted_byte += count;
5100               coding->produced += count;
                   /* Shift the converted text towards the end by COUNT
                      bytes, working backwards and putting a CR in front
                      of every LF on the way.  */
5101               p = dst = pend + count;
5102               while (count)
5103                 {
5104                   *--p = *--pend;
5105                   if (*p == '\n') count--, *--p = '\r';
5106                 }
5107             }
5108
5109           /* Suppress eol-format conversion in the further conversion.  */
5110           coding->eol_type = CODING_EOL_LF;
5111
5112           /* Set the coding system symbol to that for Unix-like EOL.  */
5113           eol_type = Fget (saved_coding_symbol, Qeol_type);
5114           if (VECTORP (eol_type)
5115               && XVECTOR (eol_type)->size == 3
5116               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5117             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5118           else
5119             coding->symbol = saved_coding_symbol;
5120
5121           continue;
5122         }
5123       if (len_byte <= 0)
5124         {
               /* All the source is consumed.  Only a CCL coding system
                  gets one more round, with CODING_MODE_LAST_BLOCK set.  */
5125           if (coding->type != coding_type_ccl
5126               || coding->mode & CODING_MODE_LAST_BLOCK)
5127             break;
5128           coding->mode |= CODING_MODE_LAST_BLOCK;
5129           continue;
5130         }
5131       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5132         {
5133           /* The source text ends in invalid codes.  Let's just
5134              make them valid buffer contents, and finish conversion.  */
5135           inserted += len_byte;
5136           inserted_byte += len_byte;
5137           while (len_byte--)
5138             *dst++ = *src++;
5139           break;
5140         }
5141       if (result == CODING_FINISH_INTERRUPT)
5142         {
5143           /* The conversion procedure was interrupted by a user.  */
5144           break;
5145         }
5146       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5147       if (coding->consumed < 1)
5148         {
5149           /* It's quite strange to require more memory without
5150              consuming any bytes.  Perhaps CCL program bug.  */
5151           break;
5152         }
5153       if (first)
5154         {
5155           /* We have just done the first batch of conversion which was
5156              stopped because of insufficient gap.  Let's reconsider the
5157              required gap size (i.e. SRC - DST) now.
5158
5159              We have converted ORIG bytes (== coding->consumed) into
5160              NEW bytes (coding->produced).  To convert the remaining
5161              LEN bytes, we may need REQUIRE bytes of gap, where:
5162                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5163                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5164              Here, we are sure that NEW >= ORIG.  */
5165           float ratio = coding->produced - coding->consumed;
5166           ratio /= coding->consumed;
5167           require = len_byte * ratio;
5168           first = 0;
5169         }
5170       if ((src - dst) < (require + 2000))
5171         {
5172           /* See the comment above the previous call of make_gap.  */
5173           int add = len_byte + inserted_byte;
5174
5175           GAP_SIZE -= add;
5176           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5177           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5178           make_gap (require + 2000);
5179           GAP_SIZE += add;
5180           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5181           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5182         }
5183     }
5184   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5185
5186   if (encodep && coding->dst_multibyte)
5187     {
5188       /* The encoded output is unibyte, but the buffer is multibyte.
5189          We must convert 8-bit characters to multibyte form.  */
5190       if (inserted_byte * 2 > GAP_SIZE)
5191         {
               /* Temporarily put the converted text back into the buffer
                  contents so that make_gap can enlarge the gap.  */
5192           GAP_SIZE -= inserted_byte;
5193           ZV += inserted_byte; Z += inserted_byte;
5194           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5195           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5196           make_gap (inserted_byte - GAP_SIZE);
5197           GAP_SIZE += inserted_byte;
5198           ZV -= inserted_byte; Z -= inserted_byte;
5199           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5200           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5201         }
5202       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5203     }
5204
5205   /* If we have shrunk the conversion area, adjust it now.  */
5206   if (total_skip > 0)
5207     {
           /* Bring the skipped tail right after the converted text.  */
5208       if (tail_skip > 0)
5209         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5210       inserted += total_skip; inserted_byte += total_skip;
5211       GAP_SIZE += total_skip;
5212       GPT -= head_skip; GPT_BYTE -= head_skip;
5213       ZV -= total_skip; ZV_BYTE -= total_skip;
5214       Z -= total_skip; Z_BYTE -= total_skip;
5215       from -= head_skip; from_byte -= head_skip;
5216       to += tail_skip; to_byte += tail_skip;
5217     }
5218
       /* INSERTED is recomputed from the change that
          adjust_after_replace makes to Z.  */
5219   prev_Z = Z;
5220   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5221   inserted = Z - prev_Z;
5222
5223   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5224     coding_restore_composition (coding, Fcurrent_buffer ());
5225   coding_free_composition_data (coding);
5226
5227   if (! inhibit_pre_post_conversion
5228       && ! encodep && ! NILP (coding->post_read_conversion))
5229     {
5230       Lisp_Object val;
           /* NOTE(review): COUNT is not referenced again in this block;
              the unwind-protect entry is discarded below by decrementing
              specpdl_ptr directly.  */
5231       int count = specpdl_ptr - specpdl;
5232
5233       if (from != PT)
5234         TEMP_SET_PT_BOTH (from, from_byte);
5235       prev_Z = Z;
5236       record_unwind_protect (code_convert_region_unwind, Qnil);
5237       /* We should not call any more pre-write/post-read-conversion
5238          functions while this post-read-conversion is running.  */
5239       inhibit_pre_post_conversion = 1;
5240       val = call1 (coding->post_read_conversion, make_number (inserted));
5241       inhibit_pre_post_conversion = 0;
5242       /* Discard the unwind protect.  */
5243       specpdl_ptr--;
           /* VAL must be a number, but INSERTED is updated from the
              change of Z, not from VAL.  */
5244       CHECK_NUMBER (val, 0);
5245       inserted += Z - prev_Z;
5246     }
5247
       /* Restore point: a point that was after the original region is
          shifted by the change of length; a point that was inside it
          (including at its start) is put at FROM.  */
5248   if (orig_point >= from)
5249     {
5250       if (orig_point >= from + orig_len)
5251         orig_point += inserted - orig_len;
5252       else
5253         orig_point = from;
5254       TEMP_SET_PT (orig_point);
5255     }
5256
5257   if (replace)
5258     {
5259       signal_after_change (from, to - from, inserted);
5260       update_compositions (from, from + inserted, CHECK_BORDER);
5261     }
5262
       /* Record the result of the whole conversion in CODING.  */
5263   {
5264     coding->consumed = to_byte - from_byte;
5265     coding->consumed_char = to - from;
5266     coding->produced = inserted_byte;
5267     coding->produced_char = inserted;
5268   }
5269
5270   return 0;
5271 }
5272
5273 Lisp_Object
5274 run_pre_post_conversion_on_str (str, coding, encodep)
5275      Lisp_Object str;
5276      struct coding_system *coding;
5277      int encodep;
5278 {
5279   int count = specpdl_ptr - specpdl;
5280   struct gcpro gcpro1;
5281   struct buffer *prev = current_buffer;
5282   int multibyte = STRING_MULTIBYTE (str);
5283
5284   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5285   record_unwind_protect (code_convert_region_unwind, Qnil);
5286   GCPRO1 (str);
5287   temp_output_buffer_setup (" *code-converting-work*");
5288   set_buffer_internal (XBUFFER (Vstandard_output));
5289   /* We must insert the contents of STR as is without
5290      unibyte<->multibyte conversion.  For that, we adjust the
5291      multibyteness of the working buffer to that of STR.  */
5292   Ferase_buffer ();
5293   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5294   insert_from_string (str, 0, 0,
5295                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5296   UNGCPRO;
5297   inhibit_pre_post_conversion = 1;
5298   if (encodep)
5299     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5300   else
5301     {
5302       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5303       call1 (coding->post_read_conversion, make_number (Z - BEG));
5304     }
5305   inhibit_pre_post_conversion = 0;
5306   str = make_buffer_string (BEG, Z, 1);
5307   return unbind_to (count, str);
5308 }
5309
5310 Lisp_Object
5311 decode_coding_string (str, coding, nocopy)
5312      Lisp_Object str;
5313      struct coding_system *coding;
5314      int nocopy;
5315 {
5316   int len;
5317   struct conversion_buffer buf;
5318   int from, to, to_byte;
5319   struct gcpro gcpro1;
5320   Lisp_Object saved_coding_symbol;
5321   int result;
5322   int require_decoding;
5323   int shrinked_bytes = 0;
5324   Lisp_Object newstr;
5325   int consumed, consumed_char, produced, produced_char;
5326
5327   from = 0;
5328   to = XSTRING (str)->size;
5329   to_byte = STRING_BYTES (XSTRING (str));
5330
5331   saved_coding_symbol = Qnil;
5332   coding->src_multibyte = STRING_MULTIBYTE (str);
5333   coding->dst_multibyte = 1;
5334   if (CODING_REQUIRE_DETECTION (coding))
5335     {
5336       /* See the comments in code_convert_region.  */
5337       if (coding->type == coding_type_undecided)
5338         {
5339           detect_coding (coding, XSTRING (str)->data, to_byte);
5340           if (coding->type == coding_type_undecided)
5341             coding->type = coding_type_emacs_mule;
5342         }
5343       if (coding->eol_type == CODING_EOL_UNDECIDED
5344           && coding->type != coding_type_ccl)
5345         {
5346           saved_coding_symbol = coding->symbol;
5347           detect_eol (coding, XSTRING (str)->data, to_byte);
5348           if (coding->eol_type == CODING_EOL_UNDECIDED)
5349             coding->eol_type = CODING_EOL_LF;
5350           /* We had better recover the original eol format if we
5351              encounter an inconsitent eol format while decoding.  */
5352           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5353         }
5354     }
5355
5356   if (coding->type == coding_type_no_conversion
5357       || coding->type == coding_type_raw_text)
5358     coding->dst_multibyte = 0;
5359
5360   require_decoding = CODING_REQUIRE_DECODING (coding);
5361
5362   if (STRING_MULTIBYTE (str))
5363     {
5364       /* Decoding routines expect the source text to be unibyte.  */
5365       str = Fstring_as_unibyte (str);
5366       to_byte = STRING_BYTES (XSTRING (str));
5367       nocopy = 1;
5368       coding->src_multibyte = 0;
5369     }
5370
5371   /* Try to skip the heading and tailing ASCIIs.  */
5372   if (require_decoding && coding->type != coding_type_ccl)
5373     {
5374       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5375                                 0);
5376       if (from == to_byte)
5377         require_decoding = 0;
5378       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5379     }
5380
5381   if (!require_decoding)
5382     {
5383       coding->consumed = STRING_BYTES (XSTRING (str));
5384       coding->consumed_char = XSTRING (str)->size;
5385       if (coding->dst_multibyte)
5386         {
5387           str = Fstring_as_multibyte (str);
5388           nocopy = 1;
5389         }
5390       coding->produced = STRING_BYTES (XSTRING (str));
5391       coding->produced_char = XSTRING (str)->size;
5392       return (nocopy ? str : Fcopy_sequence (str));
5393     }
5394
5395   if (coding->composing != COMPOSITION_DISABLED)
5396     coding_allocate_composition_data (coding, from);
5397   len = decoding_buffer_size (coding, to_byte - from);
5398   allocate_conversion_buffer (buf, len);
5399
5400   consumed = consumed_char = produced = produced_char = 0;
5401   while (1)
5402     {
5403       result = decode_coding (coding, XSTRING (str)->data + from + consumed,
5404                               buf.data + produced, to_byte - from - consumed,
5405                               buf.size - produced);
5406       consumed += coding->consumed;
5407       consumed_char += coding->consumed_char;
5408       produced += coding->produced;
5409       produced_char += coding->produced_char;
5410       if (result == CODING_FINISH_NORMAL
5411           || (result == CODING_FINISH_INSUFFICIENT_SRC
5412               && coding->consumed == 0))
5413         break;
5414       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5415         coding_allocate_composition_data (coding, from + produced_char);
5416       else if (result == CODING_FINISH_INSUFFICIENT_DST)
5417         extend_conversion_buffer (&buf);
5418       else if (result == CODING_FINISH_INCONSISTENT_EOL)
5419         {
5420           /* Recover the original EOL format.  */
5421           if (coding->eol_type == CODING_EOL_CR)
5422             {
5423               unsigned char *p;
5424               for (p = buf.data; p < buf.data + produced; p++)
5425                 if (*p == '\n') *p = '\r';
5426             }
5427           else if (coding->eol_type == CODING_EOL_CRLF)
5428             {
5429               int num_eol = 0;
5430               unsigned char *p0, *p1;
5431               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5432                 if (*p0 == '\n') num_eol++;
5433               if (produced + num_eol >= buf.size)
5434                 extend_conversion_buffer (&buf);
5435               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5436                 {
5437                   *--p1 = *--p0;
5438                   if (*p0 == '\n') *--p1 = '\r';
5439                 }
5440               produced += num_eol;
5441               produced_char += num_eol;
5442             }
5443           coding->eol_type = CODING_EOL_LF;
5444           coding->symbol = saved_coding_symbol;
5445         }
5446     }
5447
5448   coding->consumed = consumed;
5449   coding->consumed_char = consumed_char;
5450   coding->produced = produced;
5451   coding->produced_char = produced_char;
5452
5453   if (coding->dst_multibyte)
5454     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
5455                                            produced + shrinked_bytes);
5456   else
5457     newstr = make_uninit_string (produced + shrinked_bytes);
5458   if (from > 0)
5459     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5460   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5461   if (shrinked_bytes > from)
5462     bcopy (XSTRING (str)->data + to_byte,
5463            XSTRING (newstr)->data + from + produced,
5464            shrinked_bytes - from);
5465   free_conversion_buffer (&buf);
5466
5467   if (coding->cmp_data && coding->cmp_data->used)
5468     coding_restore_composition (coding, newstr);
5469   coding_free_composition_data (coding);
5470
5471   if (SYMBOLP (coding->post_read_conversion)
5472       && !NILP (Ffboundp (coding->post_read_conversion)))
5473     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
5474
5475   return newstr;
5476 }
5477
5478 Lisp_Object
5479 encode_coding_string (str, coding, nocopy)
5480      Lisp_Object str;
5481      struct coding_system *coding;
5482      int nocopy;
5483 {
5484   int len;
5485   struct conversion_buffer buf;
5486   int from, to, to_byte;
5487   struct gcpro gcpro1;
5488   Lisp_Object saved_coding_symbol;
5489   int result;
5490   int shrinked_bytes = 0;
5491   Lisp_Object newstr;
5492   int consumed, consumed_char, produced, produced_char;
5493
5494   if (SYMBOLP (coding->pre_write_conversion)
5495       && !NILP (Ffboundp (coding->pre_write_conversion)))
5496     str = run_pre_post_conversion_on_str (str, coding, 1);
5497
5498   from = 0;
5499   to = XSTRING (str)->size;
5500   to_byte = STRING_BYTES (XSTRING (str));
5501
5502   saved_coding_symbol = Qnil;
5503
5504   /* Encoding routines determine the multibyteness of the source text
5505      by coding->src_multibyte.  */
5506   coding->src_multibyte = STRING_MULTIBYTE (str);
5507   coding->dst_multibyte = 0;
5508   if (! CODING_REQUIRE_ENCODING (coding))
5509     {
5510       coding->consumed = STRING_BYTES (XSTRING (str));
5511       coding->consumed_char = XSTRING (str)->size;
5512       if (STRING_MULTIBYTE (str))
5513         {
5514           str = Fstring_as_unibyte (str);
5515           nocopy = 1;
5516         }
5517       coding->produced = STRING_BYTES (XSTRING (str));
5518       coding->produced_char = XSTRING (str)->size;
5519       return (nocopy ? str : Fcopy_sequence (str));
5520     }
5521
5522   if (coding->composing != COMPOSITION_DISABLED)
5523     coding_save_composition (coding, from, to, str);
5524
5525   /* Try to skip the heading and tailing ASCIIs.  */
5526   if (coding->type != coding_type_ccl)
5527     {
5528       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5529                                 1);
5530       if (from == to_byte)
5531         return (nocopy ? str : Fcopy_sequence (str));
5532       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5533     }
5534
5535   len = encoding_buffer_size (coding, to_byte - from);
5536   allocate_conversion_buffer (buf, len);
5537
5538   consumed = consumed_char = produced = produced_char = 0;
5539   while (1)
5540     {
5541       result = encode_coding (coding, XSTRING (str)->data + from + consumed,
5542                               buf.data + produced, to_byte - from - consumed,
5543                               buf.size - produced);
5544       consumed += coding->consumed;
5545       consumed_char += coding->consumed_char;
5546       produced += coding->produced;
5547       produced_char += coding->produced_char;
5548       if (result == CODING_FINISH_NORMAL
5549           || (result == CODING_FINISH_INSUFFICIENT_SRC
5550               && coding->consumed == 0))
5551         break;
5552       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
5553       extend_conversion_buffer (&buf);
5554     }
5555
5556   coding->consumed = consumed;
5557   coding->consumed_char = consumed_char;
5558   coding->produced = produced;
5559   coding->produced_char = produced_char;
5560
5561   newstr = make_uninit_string (produced + shrinked_bytes);
5562   if (from > 0)
5563     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5564   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5565   if (shrinked_bytes > from)
5566     bcopy (XSTRING (str)->data + to_byte,
5567            XSTRING (newstr)->data + from + produced,
5568            shrinked_bytes - from);
5569
5570   free_conversion_buffer (&buf);
5571   coding_free_composition_data (coding);
5572
5573   return newstr;
5574 }
5575
5576 \f
5577 #ifdef emacs
5578 /*** 8. Emacs Lisp library functions ***/
5579
5580 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5581   "Return t if OBJECT is nil or a coding-system.\n\
5582 See the documentation of `make-coding-system' for information\n\
5583 about coding-system objects.")
5584   (obj)
5585      Lisp_Object obj;
5586 {
5587   if (NILP (obj))
5588     return Qt;
5589   if (!SYMBOLP (obj))
5590     return Qnil;
5591   /* Get coding-spec vector for OBJ.  */
5592   obj = Fget (obj, Qcoding_system);
5593   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5594           ? Qt : Qnil);
5595 }
5596
5597 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5598        Sread_non_nil_coding_system, 1, 1, 0,
5599   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5600   (prompt)
5601      Lisp_Object prompt;
5602 {
5603   Lisp_Object val;
5604   do
5605     {
5606       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5607                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5608     }
5609   while (XSTRING (val)->size == 0);
5610   return (Fintern (val, Qnil));
5611 }
5612
5613 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5614   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5615 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5616   (prompt, default_coding_system)
5617      Lisp_Object prompt, default_coding_system;
5618 {
5619   Lisp_Object val;
5620   if (SYMBOLP (default_coding_system))
5621     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5622   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5623                           Qt, Qnil, Qcoding_system_history,
5624                           default_coding_system, Qnil);
5625   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5626 }
5627
5628 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5629        1, 1, 0,
5630   "Check validity of CODING-SYSTEM.\n\
5631 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5632 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5633 The value of property should be a vector of length 5.")
5634   (coding_system)
5635      Lisp_Object coding_system;
5636 {
5637   CHECK_SYMBOL (coding_system, 0);
5638   if (!NILP (Fcoding_system_p (coding_system)))
5639     return coding_system;
5640   while (1)
5641     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5642 }
5643 \f
5644 Lisp_Object
5645 detect_coding_system (src, src_bytes, highest, multibytep)
5646      unsigned char *src;
5647      int src_bytes, highest;
5648      int multibytep;
5649 {
5650   int coding_mask, eol_type;
5651   Lisp_Object val, tmp;
5652   int dummy;
5653
5654   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
5655   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5656   if (eol_type == CODING_EOL_INCONSISTENT)
5657     eol_type = CODING_EOL_UNDECIDED;
5658
5659   if (!coding_mask)
5660     {
5661       val = Qundecided;
5662       if (eol_type != CODING_EOL_UNDECIDED)
5663         {
5664           Lisp_Object val2;
5665           val2 = Fget (Qundecided, Qeol_type);
5666           if (VECTORP (val2))
5667             val = XVECTOR (val2)->contents[eol_type];
5668         }
5669       return (highest ? val : Fcons (val, Qnil));
5670     }
5671
5672   /* At first, gather possible coding systems in VAL.  */
5673   val = Qnil;
5674   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5675     {
5676       Lisp_Object category_val, category_index;
5677
5678       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5679       category_val = Fsymbol_value (XCAR (tmp));
5680       if (!NILP (category_val)
5681           && NATNUMP (category_index)
5682           && (coding_mask & (1 << XFASTINT (category_index))))
5683         {
5684           val = Fcons (category_val, val);
5685           if (highest)
5686             break;
5687         }
5688     }
5689   if (!highest)
5690     val = Fnreverse (val);
5691
5692   /* Then, replace the elements with subsidiary coding systems.  */
5693   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5694     {
5695       if (eol_type != CODING_EOL_UNDECIDED
5696           && eol_type != CODING_EOL_INCONSISTENT)
5697         {
5698           Lisp_Object eol;
5699           eol = Fget (XCAR (tmp), Qeol_type);
5700           if (VECTORP (eol))
5701             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5702         }
5703     }
5704   return (highest ? XCAR (val) : val);
5705 }
5706
5707 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5708        2, 3, 0,
5709   "Detect coding system of the text in the region between START and END.\n\
5710 Return a list of possible coding systems ordered by priority.\n\
5711 \n\
5712 If only ASCII characters are found, it returns a list of single element\n\
5713 `undecided' or its subsidiary coding system according to a detected\n\
5714 end-of-line format.\n\
5715 \n\
5716 If optional argument HIGHEST is non-nil, return the coding system of\n\
5717 highest priority.")
5718   (start, end, highest)
5719      Lisp_Object start, end, highest;
5720 {
5721   int from, to;
5722   int from_byte, to_byte;
5723
5724   CHECK_NUMBER_COERCE_MARKER (start, 0);
5725   CHECK_NUMBER_COERCE_MARKER (end, 1);
5726
5727   validate_region (&start, &end);
5728   from = XINT (start), to = XINT (end);
5729   from_byte = CHAR_TO_BYTE (from);
5730   to_byte = CHAR_TO_BYTE (to);
5731
5732   if (from < GPT && to >= GPT)
5733     move_gap_both (to, to_byte);
5734
5735   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5736                                to_byte - from_byte,
5737                                !NILP (highest),
5738                                !NILP (current_buffer
5739                                       ->enable_multibyte_characters));
5740 }
5741
5742 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5743        1, 2, 0,
5744   "Detect coding system of the text in STRING.\n\
5745 Return a list of possible coding systems ordered by priority.\n\
5746 \n\
5747 If only ASCII characters are found, it returns a list of single element\n\
5748 `undecided' or its subsidiary coding system according to a detected\n\
5749 end-of-line format.\n\
5750 \n\
5751 If optional argument HIGHEST is non-nil, return the coding system of\n\
5752 highest priority.")
5753   (string, highest)
5754      Lisp_Object string, highest;
5755 {
5756   CHECK_STRING (string, 0);
5757
5758   return detect_coding_system (XSTRING (string)->data,
5759                                STRING_BYTES (XSTRING (string)),
5760                                !NILP (highest),
5761                                STRING_MULTIBYTE (string));
5762 }
5763
5764 /* Return an intersection of lists L1 and L2.  */
5765
5766 static Lisp_Object
5767 intersection (l1, l2)
5768      Lisp_Object l1, l2;
5769 {
5770   Lisp_Object val;
5771
5772   for (val = Qnil; CONSP (l1); l1 = XCDR (l1))
5773     {
5774       if (!NILP (Fmemq (XCAR (l1), l2)))
5775         val = Fcons (XCAR (l1), val);
5776     }
5777   return val;
5778 }
5779
5780
5781 /*  Subroutine for Fsafe_coding_systems_region_internal.
5782
5783     Return a list of coding systems that safely encode the multibyte
5784     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
5785     possible coding systems.  If it is nil, it means that we have not
5786     yet found any coding systems.
5787
5788     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
5789     element of WORK_TABLE is set to t once the element is looked up.
5790
5791     If a non-ASCII single byte char is found, set
5792     *single_byte_char_found to 1.  */
5793
5794 static Lisp_Object
5795 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
5796      unsigned char *p, *pend;
5797      Lisp_Object safe_codings, work_table;
5798      int *single_byte_char_found;
5799 {
5800   int c, len, idx;
5801   Lisp_Object val;
5802
5803   while (p < pend)
5804     {
5805       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
5806       p += len;
5807       if (ASCII_BYTE_P (c))
5808         /* We can ignore ASCII characters here.  */
5809         continue;
5810       if (SINGLE_BYTE_CHAR_P (c))
5811         *single_byte_char_found = 1;
5812       if (NILP (safe_codings))
5813         continue;
5814       /* Check the safe coding systems for C.  */
5815       val = char_table_ref_and_index (work_table, c, &idx);
5816       if (EQ (val, Qt))
5817         /* This element was already checked.  Ignore it.  */
5818         continue;
5819       /* Remember that we checked this element.  */
5820       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
5821
5822       /* If there are some safe coding systems for C and we have
5823          already found the other set of coding systems for the
5824          different characters, get the intersection of them.  */
5825       if (!EQ (safe_codings, Qt) && !NILP (val))
5826         val = intersection (safe_codings, val);
5827       safe_codings = val;
5828     }
5829   return safe_codings;
5830 }
5831
5832
5833 /* Return a list of coding systems that safely encode the text between
5834    START and END.  If the text contains only ASCII or is unibyte,
5835    return t.  */
5836
5837 DEFUN ("find-coding-systems-region-internal",
5838        Ffind_coding_systems_region_internal,
5839        Sfind_coding_systems_region_internal, 2, 2, 0,
5840   "Internal use only.")
5841   (start, end)
5842      Lisp_Object start, end;
5843 {
5844   Lisp_Object work_table, safe_codings;
5845   int non_ascii_p = 0;
5846   int single_byte_char_found = 0;
5847   unsigned char *p1, *p1end, *p2, *p2end, *p;
5848   Lisp_Object args[2];
5849
5850   if (STRINGP (start))
5851     {
5852       if (!STRING_MULTIBYTE (start))
5853         return Qt;
5854       p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start));
5855       p2 = p2end = p1end;
5856       if (XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
5857         non_ascii_p = 1;
5858     }
5859   else
5860     {
5861       int from, to, stop;
5862
5863       CHECK_NUMBER_COERCE_MARKER (start, 0);
5864       CHECK_NUMBER_COERCE_MARKER (end, 1);
5865       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
5866         args_out_of_range (start, end);
5867       if (NILP (current_buffer->enable_multibyte_characters))
5868         return Qt;
5869       from = CHAR_TO_BYTE (XINT (start));
5870       to = CHAR_TO_BYTE (XINT (end));
5871       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
5872       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
5873       if (stop == to)
5874         p2 = p2end = p1end;
5875       else
5876         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
5877       if (XINT (end) - XINT (start) != to - from)
5878         non_ascii_p = 1;
5879     }
5880
5881   if (!non_ascii_p)
5882     {
5883       /* We are sure that the text contains no multibyte character.
5884          Check if it contains eight-bit-graphic.  */
5885       p = p1;
5886       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
5887       if (p == p1end)
5888         {
5889           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
5890           if (p == p2end)
5891             return Qt;
5892         }
5893     }
5894
5895   /* The text contains non-ASCII characters.  */
5896   work_table = Fcopy_sequence (Vchar_coding_system_table);
5897   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
5898                                     &single_byte_char_found);
5899   if (p2 < p2end)
5900     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
5901                                       &single_byte_char_found);
5902
5903   if (!single_byte_char_found)
5904     {
5905       /* Append generic coding systems.  */
5906       Lisp_Object args[2];
5907       args[0] = safe_codings;
5908       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
5909                                         make_number (0));
5910       safe_codings = Fappend (2, args);
5911     }
5912   else
5913     safe_codings = Fcons (Qraw_text,
5914                           Fcons (Qemacs_mule,
5915                                  Fcons (Qno_conversion, safe_codings)));
5916   return safe_codings;
5917 }
5918
5919
5920 Lisp_Object
5921 code_convert_region1 (start, end, coding_system, encodep)
5922      Lisp_Object start, end, coding_system;
5923      int encodep;
5924 {
5925   struct coding_system coding;
5926   int from, to, len;
5927
5928   CHECK_NUMBER_COERCE_MARKER (start, 0);
5929   CHECK_NUMBER_COERCE_MARKER (end, 1);
5930   CHECK_SYMBOL (coding_system, 2);
5931
5932   validate_region (&start, &end);
5933   from = XFASTINT (start);
5934   to = XFASTINT (end);
5935
5936   if (NILP (coding_system))
5937     return make_number (to - from);
5938
5939   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5940     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5941
5942   coding.mode |= CODING_MODE_LAST_BLOCK;
5943   coding.src_multibyte = coding.dst_multibyte
5944     = !NILP (current_buffer->enable_multibyte_characters);
5945   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5946                        &coding, encodep, 1);
5947   Vlast_coding_system_used = coding.symbol;
5948   return make_number (coding.produced_char);
5949 }
5950
5951 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5952        3, 3, "r\nzCoding system: ",
5953   "Decode the current region by specified coding system.\n\
5954 When called from a program, takes three arguments:\n\
5955 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5956 This function sets `last-coding-system-used' to the precise coding system\n\
5957 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5958 not fully specified.)\n\
5959 It returns the length of the decoded text.")
5960   (start, end, coding_system)
5961      Lisp_Object start, end, coding_system;
5962 {
5963   return code_convert_region1 (start, end, coding_system, 0);
5964 }
5965
5966 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5967        3, 3, "r\nzCoding system: ",
5968   "Encode the current region by specified coding system.\n\
5969 When called from a program, takes three arguments:\n\
5970 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5971 This function sets `last-coding-system-used' to the precise coding system\n\
5972 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5973 not fully specified.)\n\
5974 It returns the length of the encoded text.")
5975   (start, end, coding_system)
5976      Lisp_Object start, end, coding_system;
5977 {
5978   return code_convert_region1 (start, end, coding_system, 1);
5979 }
5980
5981 Lisp_Object
5982 code_convert_string1 (string, coding_system, nocopy, encodep)
5983      Lisp_Object string, coding_system, nocopy;
5984      int encodep;
5985 {
5986   struct coding_system coding;
5987
5988   CHECK_STRING (string, 0);
5989   CHECK_SYMBOL (coding_system, 1);
5990
5991   if (NILP (coding_system))
5992     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5993
5994   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5995     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5996
5997   coding.mode |= CODING_MODE_LAST_BLOCK;
5998   string = (encodep
5999             ? encode_coding_string (string, &coding, !NILP (nocopy))
6000             : decode_coding_string (string, &coding, !NILP (nocopy)));
6001   Vlast_coding_system_used = coding.symbol;
6002
6003   return string;
6004 }
6005
6006 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6007        2, 3, 0,
6008   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
6009 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
6010 if the decoding operation is trivial.\n\
6011 This function sets `last-coding-system-used' to the precise coding system\n\
6012 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
6013 not fully specified.)")
6014   (string, coding_system, nocopy)
6015      Lisp_Object string, coding_system, nocopy;
6016 {
6017   return code_convert_string1 (string, coding_system, nocopy, 0);
6018 }
6019
6020 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6021        2, 3, 0,
6022   "Encode STRING to CODING-SYSTEM, and return the result.\n\
6023 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
6024 if the encoding operation is trivial.\n\
6025 This function sets `last-coding-system-used' to the precise coding system\n\
6026 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
6027 not fully specified.)")
6028   (string, coding_system, nocopy)
6029      Lisp_Object string, coding_system, nocopy;
6030 {
6031   return code_convert_string1 (string, coding_system, nocopy, 1);
6032 }
6033
6034 /* Encode or decode STRING according to CODING_SYSTEM.
6035    Do not set Vlast_coding_system_used.
6036
6037    This function is called only from macros DECODE_FILE and
6038    ENCODE_FILE, thus we ignore character composition.  */
6039
6040 Lisp_Object
6041 code_convert_string_norecord (string, coding_system, encodep)
6042      Lisp_Object string, coding_system;
6043      int encodep;
6044 {
6045   struct coding_system coding;
6046
6047   CHECK_STRING (string, 0);
6048   CHECK_SYMBOL (coding_system, 1);
6049
6050   if (NILP (coding_system))
6051     return string;
6052
6053   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6054     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
6055
6056   coding.composing = COMPOSITION_DISABLED;
6057   coding.mode |= CODING_MODE_LAST_BLOCK;
6058   return (encodep
6059           ? encode_coding_string (string, &coding, 1)
6060           : decode_coding_string (string, &coding, 1));
6061 }
6062 \f
6063 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6064   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
6065 Return the corresponding character.")
6066   (code)
6067      Lisp_Object code;
6068 {
6069   unsigned char c1, c2, s1, s2;
6070   Lisp_Object val;
6071
6072   CHECK_NUMBER (code, 0);
6073   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6074   if (s1 == 0)
6075     {
6076       if (s2 < 0x80)
6077         XSETFASTINT (val, s2);
6078       else if (s2 >= 0xA0 || s2 <= 0xDF)
6079         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6080       else
6081         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6082     }
6083   else
6084     {
6085       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
6086           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6087         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6088       DECODE_SJIS (s1, s2, c1, c2);
6089       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6090     }
6091   return val;
6092 }
6093
6094 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6095   "Encode a Japanese character CHAR to shift_jis encoding.\n\
6096 Return the corresponding code in SJIS.")
6097   (ch)
6098      Lisp_Object ch;
6099 {
6100   int charset, c1, c2, s1, s2;
6101   Lisp_Object val;
6102
6103   CHECK_NUMBER (ch, 0);
6104   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6105   if (charset == CHARSET_ASCII)
6106     {
6107       val = ch;
6108     }
6109   else if (charset == charset_jisx0208
6110            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6111     {
6112       ENCODE_SJIS (c1, c2, s1, s2);
6113       XSETFASTINT (val, (s1 << 8) | s2);
6114     }
6115   else if (charset == charset_katakana_jisx0201
6116            && c1 > 0x20 && c2 < 0xE0)
6117     {
6118       XSETFASTINT (val, c1 | 0x80);
6119     }
6120   else
6121     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6122   return val;
6123 }
6124
6125 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6126   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
6127 Return the corresponding character.")
6128   (code)
6129      Lisp_Object code;
6130 {
6131   int charset;
6132   unsigned char b1, b2, c1, c2;
6133   Lisp_Object val;
6134
6135   CHECK_NUMBER (code, 0);
6136   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6137   if (b1 == 0)
6138     {
6139       if (b2 >= 0x80)
6140         error ("Invalid BIG5 code: %x", XFASTINT (code));
6141       val = code;
6142     }
6143   else
6144     {
6145       if ((b1 < 0xA1 || b1 > 0xFE)
6146           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6147         error ("Invalid BIG5 code: %x", XFASTINT (code));
6148       DECODE_BIG5 (b1, b2, charset, c1, c2);
6149       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6150     }
6151   return val;
6152 }
6153
6154 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6155   "Encode the Big5 character CHAR to BIG5 coding system.\n\
6156 Return the corresponding character code in Big5.")
6157   (ch)
6158      Lisp_Object ch;
6159 {
6160   int charset, c1, c2, b1, b2;
6161   Lisp_Object val;
6162
6163   CHECK_NUMBER (ch, 0);
6164   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6165   if (charset == CHARSET_ASCII)
6166     {
6167       val = ch;
6168     }
6169   else if ((charset == charset_big5_1
6170             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6171            || (charset == charset_big5_2
6172                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6173     {
6174       ENCODE_BIG5 (charset, c1, c2, b1, b2);
6175       XSETFASTINT (val, (b1 << 8) | b2);
6176     }
6177   else
6178     error ("Can't encode to Big5: %d", XFASTINT (ch));
6179   return val;
6180 }
6181 \f
6182 DEFUN ("set-terminal-coding-system-internal",
6183        Fset_terminal_coding_system_internal,
6184        Sset_terminal_coding_system_internal, 1, 1, 0, "")
6185   (coding_system)
6186      Lisp_Object coding_system;
6187 {
6188   CHECK_SYMBOL (coding_system, 0);
6189   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6190   /* We had better not send unsafe characters to terminal.  */
6191   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6192   /* Characer composition should be disabled.  */
6193   terminal_coding.composing = COMPOSITION_DISABLED;
6194   terminal_coding.src_multibyte = 1;
6195   terminal_coding.dst_multibyte = 0;
6196   return Qnil;
6197 }
6198
6199 DEFUN ("set-safe-terminal-coding-system-internal",
6200        Fset_safe_terminal_coding_system_internal,
6201        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
6202   (coding_system)
6203      Lisp_Object coding_system;
6204 {
6205   CHECK_SYMBOL (coding_system, 0);
6206   setup_coding_system (Fcheck_coding_system (coding_system),
6207                        &safe_terminal_coding);
6208   /* Characer composition should be disabled.  */
6209   safe_terminal_coding.composing = COMPOSITION_DISABLED;
6210   safe_terminal_coding.src_multibyte = 1;
6211   safe_terminal_coding.dst_multibyte = 0;
6212   return Qnil;
6213 }
6214
6215 DEFUN ("terminal-coding-system",
6216        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6217   "Return coding system specified for terminal output.")
6218   ()
6219 {
6220   return terminal_coding.symbol;
6221 }
6222
6223 DEFUN ("set-keyboard-coding-system-internal",
6224        Fset_keyboard_coding_system_internal,
6225        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
6226   (coding_system)
6227      Lisp_Object coding_system;
6228 {
6229   CHECK_SYMBOL (coding_system, 0);
6230   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6231   /* Characer composition should be disabled.  */
6232   keyboard_coding.composing = COMPOSITION_DISABLED;
6233   return Qnil;
6234 }
6235
6236 DEFUN ("keyboard-coding-system",
6237        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6238   "Return coding system specified for decoding keyboard input.")
6239   ()
6240 {
6241   return keyboard_coding.symbol;
6242 }
6243
6244 \f
6245 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6246        Sfind_operation_coding_system,  1, MANY, 0,
6247   "Choose a coding system for an operation based on the target name.\n\
6248 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
6249 DECODING-SYSTEM is the coding system to use for decoding\n\
6250 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
6251 for encoding (in case OPERATION does encoding).\n\
6252 \n\
6253 The first argument OPERATION specifies an I/O primitive:\n\
6254   For file I/O, `insert-file-contents' or `write-region'.\n\
6255   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
6256   For network I/O, `open-network-stream'.\n\
6257 \n\
6258 The remaining arguments should be the same arguments that were passed\n\
6259 to the primitive.  Depending on which primitive, one of those arguments\n\
6260 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
6261 whichever argument specifies the file name is TARGET.\n\
6262 \n\
6263 TARGET has a meaning which depends on OPERATION:\n\
6264   For file I/O, TARGET is a file name.\n\
6265   For process I/O, TARGET is a process name.\n\
6266   For network I/O, TARGET is a service name or a port number\n\
6267 \n\
6268 This function looks up what specified for TARGET in,\n\
6269 `file-coding-system-alist', `process-coding-system-alist',\n\
6270 or `network-coding-system-alist' depending on OPERATION.\n\
6271 They may specify a coding system, a cons of coding systems,\n\
6272 or a function symbol to call.\n\
6273 In the last case, we call the function with one argument,\n\
6274 which is a list of all the arguments given to this function.")
6275   (nargs, args)
6276      int nargs;
6277      Lisp_Object *args;
6278 {
6279   Lisp_Object operation, target_idx, target, val;
6280   register Lisp_Object chain;
6281
6282   if (nargs < 2)
6283     error ("Too few arguments");
6284   operation = args[0];
6285   if (!SYMBOLP (operation)
6286       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
6287     error ("Invalid first arguement");
6288   if (nargs < 1 + XINT (target_idx))
6289     error ("Too few arguments for operation: %s",
6290            XSYMBOL (operation)->name->data);
6291   target = args[XINT (target_idx) + 1];
6292   if (!(STRINGP (target)
6293         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
6294     error ("Invalid %dth argument", XINT (target_idx) + 1);
6295
6296   chain = ((EQ (operation, Qinsert_file_contents)
6297             || EQ (operation, Qwrite_region))
6298            ? Vfile_coding_system_alist
6299            : (EQ (operation, Qopen_network_stream)
6300               ? Vnetwork_coding_system_alist
6301               : Vprocess_coding_system_alist));
6302   if (NILP (chain))
6303     return Qnil;
6304
6305   for (; CONSP (chain); chain = XCDR (chain))
6306     {
6307       Lisp_Object elt;
6308       elt = XCAR (chain);
6309
6310       if (CONSP (elt)
6311           && ((STRINGP (target)
6312                && STRINGP (XCAR (elt))
6313                && fast_string_match (XCAR (elt), target) >= 0)
6314               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6315         {
6316           val = XCDR (elt);
6317           /* Here, if VAL is both a valid coding system and a valid
6318              function symbol, we return VAL as a coding system.  */
6319           if (CONSP (val))
6320             return val;
6321           if (! SYMBOLP (val))
6322             return Qnil;
6323           if (! NILP (Fcoding_system_p (val)))
6324             return Fcons (val, val);
6325           if (! NILP (Ffboundp (val)))
6326             {
6327               val = call1 (val, Flist (nargs, args));
6328               if (CONSP (val))
6329                 return val;
6330               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
6331                 return Fcons (val, val);
6332             }
6333           return Qnil;
6334         }
6335     }
6336   return Qnil;
6337 }
6338
6339 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
6340        Supdate_coding_systems_internal, 0, 0, 0,
6341   "Update internal database for ISO2022 and CCL based coding systems.\n\
6342 When values of any coding categories are changed, you must\n\
6343 call this function")
6344   ()
6345 {
6346   int i;
6347
6348   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
6349     {
6350       Lisp_Object val;
6351
6352       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
6353       if (!NILP (val))
6354         {
6355           if (! coding_system_table[i])
6356             coding_system_table[i] = ((struct coding_system *)
6357                                       xmalloc (sizeof (struct coding_system)));
6358           setup_coding_system (val, coding_system_table[i]);
6359         }
6360       else if (coding_system_table[i])
6361         {
6362           xfree (coding_system_table[i]);
6363           coding_system_table[i] = NULL;
6364         }
6365     }
6366
6367   return Qnil;
6368 }
6369
6370 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
6371        Sset_coding_priority_internal, 0, 0, 0,
6372   "Update internal database for the current value of `coding-category-list'.\n\
6373 This function is internal use only.")
6374   ()
6375 {
6376   int i = 0, idx;
6377   Lisp_Object val;
6378
6379   val = Vcoding_category_list;
6380
6381   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
6382     {
6383       if (! SYMBOLP (XCAR (val)))
6384         break;
6385       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6386       if (idx >= CODING_CATEGORY_IDX_MAX)
6387         break;
6388       coding_priorities[i++] = (1 << idx);
6389       val = XCDR (val);
6390     }
6391   /* If coding-category-list is valid and contains all coding
6392      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
6393      the following code saves Emacs from crashing.  */
6394   while (i < CODING_CATEGORY_IDX_MAX)
6395     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6396
6397   return Qnil;
6398 }
6399
6400 #endif /* emacs */
6401
6402 \f
6403 /*** 9. Post-amble ***/
6404
6405 void
6406 init_coding_once ()
6407 {
6408   int i;
6409
6410   /* Emacs' internal format specific initialize routine.  */
6411   for (i = 0; i <= 0x20; i++)
6412     emacs_code_class[i] = EMACS_control_code;
6413   emacs_code_class[0x0A] = EMACS_linefeed_code;
6414   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6415   for (i = 0x21 ; i < 0x7F; i++)
6416     emacs_code_class[i] = EMACS_ascii_code;
6417   emacs_code_class[0x7F] = EMACS_control_code;
6418   for (i = 0x80; i < 0xFF; i++)
6419     emacs_code_class[i] = EMACS_invalid_code;
6420   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6421   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6422   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6423   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6424
6425   /* ISO2022 specific initialize routine.  */
6426   for (i = 0; i < 0x20; i++)
6427     iso_code_class[i] = ISO_control_0;
6428   for (i = 0x21; i < 0x7F; i++)
6429     iso_code_class[i] = ISO_graphic_plane_0;
6430   for (i = 0x80; i < 0xA0; i++)
6431     iso_code_class[i] = ISO_control_1;
6432   for (i = 0xA1; i < 0xFF; i++)
6433     iso_code_class[i] = ISO_graphic_plane_1;
6434   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6435   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6436   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6437   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6438   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6439   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6440   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6441   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6442   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6443   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6444
6445   setup_coding_system (Qnil, &keyboard_coding);
6446   setup_coding_system (Qnil, &terminal_coding);
6447   setup_coding_system (Qnil, &safe_terminal_coding);
6448   setup_coding_system (Qnil, &default_buffer_file_coding);
6449
6450   bzero (coding_system_table, sizeof coding_system_table);
6451
6452   bzero (ascii_skip_code, sizeof ascii_skip_code);
6453   for (i = 0; i < 128; i++)
6454     ascii_skip_code[i] = 1;
6455
6456 #if defined (MSDOS) || defined (WINDOWSNT)
6457   system_eol_type = CODING_EOL_CRLF;
6458 #else
6459   system_eol_type = CODING_EOL_LF;
6460 #endif
6461
6462   inhibit_pre_post_conversion = 0;
6463 }
6464
6465 #ifdef emacs
6466
6467 void
6468 syms_of_coding ()
6469 {
6470   Qtarget_idx = intern ("target-idx");
6471   staticpro (&Qtarget_idx);
6472
6473   Qcoding_system_history = intern ("coding-system-history");
6474   staticpro (&Qcoding_system_history);
6475   Fset (Qcoding_system_history, Qnil);
6476
6477   /* Target FILENAME is the first argument.  */
6478   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6479   /* Target FILENAME is the third argument.  */
6480   Fput (Qwrite_region, Qtarget_idx, make_number (2));
6481
6482   Qcall_process = intern ("call-process");
6483   staticpro (&Qcall_process);
6484   /* Target PROGRAM is the first argument.  */
6485   Fput (Qcall_process, Qtarget_idx, make_number (0));
6486
6487   Qcall_process_region = intern ("call-process-region");
6488   staticpro (&Qcall_process_region);
6489   /* Target PROGRAM is the third argument.  */
6490   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6491
6492   Qstart_process = intern ("start-process");
6493   staticpro (&Qstart_process);
6494   /* Target PROGRAM is the third argument.  */
6495   Fput (Qstart_process, Qtarget_idx, make_number (2));
6496
6497   Qopen_network_stream = intern ("open-network-stream");
6498   staticpro (&Qopen_network_stream);
6499   /* Target SERVICE is the fourth argument.  */
6500   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6501
6502   Qcoding_system = intern ("coding-system");
6503   staticpro (&Qcoding_system);
6504
6505   Qeol_type = intern ("eol-type");
6506   staticpro (&Qeol_type);
6507
6508   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6509   staticpro (&Qbuffer_file_coding_system);
6510
6511   Qpost_read_conversion = intern ("post-read-conversion");
6512   staticpro (&Qpost_read_conversion);
6513
6514   Qpre_write_conversion = intern ("pre-write-conversion");
6515   staticpro (&Qpre_write_conversion);
6516
6517   Qno_conversion = intern ("no-conversion");
6518   staticpro (&Qno_conversion);
6519
6520   Qundecided = intern ("undecided");
6521   staticpro (&Qundecided);
6522
6523   Qcoding_system_p = intern ("coding-system-p");
6524   staticpro (&Qcoding_system_p);
6525
6526   Qcoding_system_error = intern ("coding-system-error");
6527   staticpro (&Qcoding_system_error);
6528
6529   Fput (Qcoding_system_error, Qerror_conditions,
6530         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6531   Fput (Qcoding_system_error, Qerror_message,
6532         build_string ("Invalid coding system"));
6533
6534   Qcoding_category = intern ("coding-category");
6535   staticpro (&Qcoding_category);
6536   Qcoding_category_index = intern ("coding-category-index");
6537   staticpro (&Qcoding_category_index);
6538
6539   Vcoding_category_table
6540     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6541   staticpro (&Vcoding_category_table);
6542   {
6543     int i;
6544     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6545       {
6546         XVECTOR (Vcoding_category_table)->contents[i]
6547           = intern (coding_category_name[i]);
6548         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6549               Qcoding_category_index, make_number (i));
6550       }
6551   }
6552
6553   Qtranslation_table = intern ("translation-table");
6554   staticpro (&Qtranslation_table);
6555   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6556
6557   Qtranslation_table_id = intern ("translation-table-id");
6558   staticpro (&Qtranslation_table_id);
6559
6560   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6561   staticpro (&Qtranslation_table_for_decode);
6562
6563   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6564   staticpro (&Qtranslation_table_for_encode);
6565
6566   Qsafe_chars = intern ("safe-chars");
6567   staticpro (&Qsafe_chars);
6568
6569   Qchar_coding_system = intern ("char-coding-system");
6570   staticpro (&Qchar_coding_system);
6571
6572   /* Intern this now in case it isn't already done.
6573      Setting this variable twice is harmless.
6574      But don't staticpro it here--that is done in alloc.c.  */
6575   Qchar_table_extra_slots = intern ("char-table-extra-slots");
6576   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
6577   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (1));
6578
6579   Qvalid_codes = intern ("valid-codes");
6580   staticpro (&Qvalid_codes);
6581
6582   Qemacs_mule = intern ("emacs-mule");
6583   staticpro (&Qemacs_mule);
6584
6585   Qraw_text = intern ("raw-text");
6586   staticpro (&Qraw_text);
6587
6588   defsubr (&Scoding_system_p);
6589   defsubr (&Sread_coding_system);
6590   defsubr (&Sread_non_nil_coding_system);
6591   defsubr (&Scheck_coding_system);
6592   defsubr (&Sdetect_coding_region);
6593   defsubr (&Sdetect_coding_string);
6594   defsubr (&Sfind_coding_systems_region_internal);
6595   defsubr (&Sdecode_coding_region);
6596   defsubr (&Sencode_coding_region);
6597   defsubr (&Sdecode_coding_string);
6598   defsubr (&Sencode_coding_string);
6599   defsubr (&Sdecode_sjis_char);
6600   defsubr (&Sencode_sjis_char);
6601   defsubr (&Sdecode_big5_char);
6602   defsubr (&Sencode_big5_char);
6603   defsubr (&Sset_terminal_coding_system_internal);
6604   defsubr (&Sset_safe_terminal_coding_system_internal);
6605   defsubr (&Sterminal_coding_system);
6606   defsubr (&Sset_keyboard_coding_system_internal);
6607   defsubr (&Skeyboard_coding_system);
6608   defsubr (&Sfind_operation_coding_system);
6609   defsubr (&Supdate_coding_systems_internal);
6610   defsubr (&Sset_coding_priority_internal);
6611
6612   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6613     "List of coding systems.\n\
6614 \n\
6615 Do not alter the value of this variable manually.  This variable should be\n\
6616 updated by the functions `make-coding-system' and\n\
6617 `define-coding-system-alias'.");
6618   Vcoding_system_list = Qnil;
6619
6620   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6621     "Alist of coding system names.\n\
6622 Each element is one element list of coding system name.\n\
6623 This variable is given to `completing-read' as TABLE argument.\n\
6624 \n\
6625 Do not alter the value of this variable manually.  This variable should be\n\
6626 updated by the functions `make-coding-system' and\n\
6627 `define-coding-system-alias'.");
6628   Vcoding_system_alist = Qnil;
6629
6630   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6631     "List of coding-categories (symbols) ordered by priority.");
6632   {
6633     int i;
6634
6635     Vcoding_category_list = Qnil;
6636     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6637       Vcoding_category_list
6638         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6639                  Vcoding_category_list);
6640   }
6641
6642   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6643     "Specify the coding system for read operations.\n\
6644 It is useful to bind this variable with `let', but do not set it globally.\n\
6645 If the value is a coding system, it is used for decoding on read operation.\n\
6646 If not, an appropriate element is used from one of the coding system alists:\n\
6647 There are three such tables, `file-coding-system-alist',\n\
6648 `process-coding-system-alist', and `network-coding-system-alist'.");
6649   Vcoding_system_for_read = Qnil;
6650
6651   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6652     "Specify the coding system for write operations.\n\
6653 Programs bind this variable with `let', but you should not set it globally.\n\
6654 If the value is a coding system, it is used for encoding of output,\n\
6655 when writing it to a file and when sending it to a file or subprocess.\n\
6656 \n\
6657 If this does not specify a coding system, an appropriate element\n\
6658 is used from one of the coding system alists:\n\
6659 There are three such tables, `file-coding-system-alist',\n\
6660 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6661 For output to files, if the above procedure does not specify a coding system,\n\
6662 the value of `buffer-file-coding-system' is used.");
6663   Vcoding_system_for_write = Qnil;
6664
6665   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6666     "Coding system used in the latest file or process I/O.");
6667   Vlast_coding_system_used = Qnil;
6668
6669   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6670     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6671 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6672 such conversion.");
6673   inhibit_eol_conversion = 0;
6674
6675   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6676     "Non-nil means process buffer inherits coding system of process output.\n\
6677 Bind it to t if the process output is to be treated as if it were a file\n\
6678 read from some filesystem.");
6679   inherit_process_coding_system = 0;
6680
6681   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6682     "Alist to decide a coding system to use for a file I/O operation.\n\
6683 The format is ((PATTERN . VAL) ...),\n\
6684 where PATTERN is a regular expression matching a file name,\n\
6685 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6686 If VAL is a coding system, it is used for both decoding and encoding\n\
6687 the file contents.\n\
6688 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6689 and the cdr part is used for encoding.\n\
6690 If VAL is a function symbol, the function must return a coding system\n\
6691 or a cons of coding systems which are used as above.\n\
6692 \n\
6693 See also the function `find-operation-coding-system'\n\
6694 and the variable `auto-coding-alist'.");
6695   Vfile_coding_system_alist = Qnil;
6696
6697   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6698     "Alist to decide a coding system to use for a process I/O operation.\n\
6699 The format is ((PATTERN . VAL) ...),\n\
6700 where PATTERN is a regular expression matching a program name,\n\
6701 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6702 If VAL is a coding system, it is used for both decoding what received\n\
6703 from the program and encoding what sent to the program.\n\
6704 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6705 and the cdr part is used for encoding.\n\
6706 If VAL is a function symbol, the function must return a coding system\n\
6707 or a cons of coding systems which are used as above.\n\
6708 \n\
6709 See also the function `find-operation-coding-system'.");
6710   Vprocess_coding_system_alist = Qnil;
6711
6712   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6713     "Alist to decide a coding system to use for a network I/O operation.\n\
6714 The format is ((PATTERN . VAL) ...),\n\
6715 where PATTERN is a regular expression matching a network service name\n\
6716 or is a port number to connect to,\n\
6717 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6718 If VAL is a coding system, it is used for both decoding what received\n\
6719 from the network stream and encoding what sent to the network stream.\n\
6720 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6721 and the cdr part is used for encoding.\n\
6722 If VAL is a function symbol, the function must return a coding system\n\
6723 or a cons of coding systems which are used as above.\n\
6724 \n\
6725 See also the function `find-operation-coding-system'.");
6726   Vnetwork_coding_system_alist = Qnil;
6727
6728   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6729     "Coding system to use with system messages.");
6730   Vlocale_coding_system = Qnil;
6731
6732   /* The eol mnemonics are reset in startup.el system-dependently.  */
6733   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6734     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6735   eol_mnemonic_unix = build_string (":");
6736
6737   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6738     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6739   eol_mnemonic_dos = build_string ("\\");
6740
6741   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6742     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6743   eol_mnemonic_mac = build_string ("/");
6744
6745   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6746     "*String displayed in mode line when end-of-line format is not yet determined.");
6747   eol_mnemonic_undecided = build_string (":");
6748
6749   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6750     "*Non-nil enables character translation while encoding and decoding.");
6751   Venable_character_translation = Qt;
6752
6753   DEFVAR_LISP ("standard-translation-table-for-decode",
6754     &Vstandard_translation_table_for_decode,
6755     "Table for translating characters while decoding.");
6756   Vstandard_translation_table_for_decode = Qnil;
6757
6758   DEFVAR_LISP ("standard-translation-table-for-encode",
6759     &Vstandard_translation_table_for_encode,
6760     "Table for translationg characters while encoding.");
6761   Vstandard_translation_table_for_encode = Qnil;
6762
6763   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6764     "Alist of charsets vs revision numbers.\n\
6765 While encoding, if a charset (car part of an element) is found,\n\
6766 designate it with the escape sequence identifing revision (cdr part of the element).");
6767   Vcharset_revision_alist = Qnil;
6768
6769   DEFVAR_LISP ("default-process-coding-system",
6770                &Vdefault_process_coding_system,
6771     "Cons of coding systems used for process I/O by default.\n\
6772 The car part is used for decoding a process output,\n\
6773 the cdr part is used for encoding a text to be sent to a process.");
6774   Vdefault_process_coding_system = Qnil;
6775
6776   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6777     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6778 This is a vector of length 256.\n\
6779 If Nth element is non-nil, the existence of code N in a file\n\
6780 \(or output of subprocess) doesn't prevent it to be detected as\n\
6781 a coding system of ISO 2022 variant which has a flag\n\
6782 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6783 or reading output of a subprocess.\n\
6784 Only 128th through 159th elements has a meaning.");
6785   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6786
6787   DEFVAR_LISP ("select-safe-coding-system-function",
6788                &Vselect_safe_coding_system_function,
6789     "Function to call to select safe coding system for encoding a text.\n\
6790 \n\
6791 If set, this function is called to force a user to select a proper\n\
6792 coding system which can encode the text in the case that a default\n\
6793 coding system used in each operation can't encode the text.\n\
6794 \n\
6795 The default value is `select-safe-coding-system' (which see).");
6796   Vselect_safe_coding_system_function = Qnil;
6797
6798   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
6799     "Char-table containing safe coding systems of each characters.\n\
6800 Each element doesn't include such generic coding systems that can\n\
6801 encode any characters.   They are in the first extra slot.");
6802   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
6803
6804   DEFVAR_BOOL ("inhibit-iso-escape-detection",
6805                &inhibit_iso_escape_detection,
6806     "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\
6807 \n\
6808 By default, on reading a file, Emacs tries to detect how the text is\n\
6809 encoded.  This code detection is sensitive to escape sequences.  If\n\
6810 the sequence is valid as ISO2022, the code is determined as one of\n\
6811 the ISO2022 encodings, and the file is decoded by the corresponding\n\
6812 coding system (e.g. `iso-2022-7bit').\n\
6813 \n\
6814 However, there may be a case that you want to read escape sequences in\n\
6815 a file as is.  In such a case, you can set this variable to non-nil.\n\
6816 Then, as the code detection ignores any escape sequences, no file is\n\
6817 detected as encoded in some ISO2022 encoding.  The result is that all\n\
6818 escape sequences become visible in a buffer.\n\
6819 \n\
6820 The default value is nil, and it is strongly recommended not to change\n\
6821 it.  That is because many Emacs Lisp source files that contain\n\
6822 non-ASCII characters are encoded by the coding system `iso-2022-7bit'\n\
6823 in Emacs's distribution, and they won't be decoded correctly on\n\
6824 reading if you suppress escape sequence detection.\n\
6825 \n\
6826 The other way to read escape sequences in a file without decoding is\n\
6827 to explicitly specify some coding system that doesn't use ISO2022's\n\
6828 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].");
6829   inhibit_iso_escape_detection = 0;
6830 }
6831
6832 char *
6833 emacs_strerror (error_number)
6834      int error_number;
6835 {
6836   char *str;
6837
6838   synchronize_system_messages_locale ();
6839   str = strerror (error_number);
6840
6841   if (! NILP (Vlocale_coding_system))
6842     {
6843       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6844                                                       Vlocale_coding_system,
6845                                                       0);
6846       str = (char *) XSTRING (dec)->data;
6847     }
6848
6849   return str;
6850 }
6851
6852 #endif /* emacs */
6853